Migrate amdgpu-windows-interop to rocm-systems (#808)

Dieser Commit ist enthalten in:
Joseph Macaranas
2025-09-05 10:32:44 -04:00
committet von GitHub
Ursprung 3d9d35a1f8
Commit 5ca7af2d30
261 geänderte Dateien mit 86831 neuen und 2 gelöschten Zeilen
@@ -0,0 +1,833 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file pal.h
* @brief Common include for the Platform Abstraction Library (PAL) interface. Defines common types, enums, etc.
***********************************************************************************************************************
*/
#pragma once
#include "palFormat.h"
#include "palSysUtil.h"
// Forward declarations of global types (must be done outside of Pal namespace).
// Only the type names are introduced here, which is sufficient for the pointer typedefs declared inside the Pal
// namespace below.
#if (PAL_KMT_BUILD) && !defined(__unix__)
struct HMONITOR__; // Pointee type of Pal::OsDisplayHandle in the _WIN32 branch below.
struct HWND__; // Pointee type of Pal::OsWindowHandle in the _WIN32 branch below.
#endif
#if PAL_KMT_BUILD
struct _SECURITY_ATTRIBUTES; // NOTE(review): not referenced in this part of the header; presumably used further down.
#endif
/// Library-wide namespace encapsulating all PAL entities.
namespace Pal
{
// Re-export the utility library's fundamental types so PAL clients can spell them as Pal::<type>.
using int8   = Util::int8;   ///< Signed 8-bit integer.
using int16  = Util::int16;  ///< Signed 16-bit integer.
using int32  = Util::int32;  ///< Signed 32-bit integer.
using int64  = Util::int64;  ///< Signed 64-bit integer.
using uint8  = Util::uint8;  ///< Unsigned 8-bit integer.
using uint16 = Util::uint16; ///< Unsigned 16-bit integer.
using uint32 = Util::uint32; ///< Unsigned 32-bit integer.
using uint64 = Util::uint64; ///< Unsigned 64-bit integer.
/// Used to specify GPU addresses and sizes of GPU allocations. This differs from size_t since the GPU still uses
/// 64-bit addresses on a 32-bit OS.
using gpusize = Util::gpusize;
/// Result codes; the PAL core and its utility companion share the same set for convenience.
using Result = Util::Result;
/// A ratio of two unsigned integers.
using Rational = Util::Rational;
#if defined(_WIN32)
typedef HMONITOR__* OsDisplayHandle; ///< OsDisplayHandle corresponds to an HMONITOR on Windows.
typedef HWND__* OsWindowHandle; ///< OsWindowHandle corresponds to an HWND on Windows.
typedef void* OsExternalHandle; ///< OsExternalHandle corresponds to a generic HANDLE on Windows
typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on Vulkan.
constexpr OsWindowHandle NullWindowHandle = nullptr; ///< Value representing a null or invalid window handle.
#elif defined(__unix__)
typedef void* OsDisplayHandle; ///< The Display Handle for Linux except X11 platform
typedef uint32 OsExternalHandle; ///< OsExternalHandle corresponds to a generic handle on linux
typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on linux.
/// OsWindowHandle corresponds to a window on X-Windows or surface on Wayland.
union OsWindowHandle
{
    void*  pSurface; ///< Native surface handle in wayland is a pointer.
    uint64 win;      ///< Native window handle in X is a 32-bit integer (but stored here as 64 bit).
};
constexpr OsWindowHandle NullWindowHandle = {nullptr}; ///< Value representing a null or invalid window handle.
/// Equality for window handles. The comparison does not check which Linux window-system platform produced the
/// handle; it simply compares the larger member of the union. That member is `win`: it is always 64 bits wide,
/// whereas `pSurface` is pointer-sized and therefore narrower on 32-bit builds, where comparing it would ignore
/// half of the stored value.
///
/// @note On builds where pointers are narrower than 64 bits, a handle populated through pSurface should start from
///       a zero-initialized union (e.g. a copy of NullWindowHandle) so that the unused bytes of `win` compare equal.
inline bool operator==(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs)
{
    return (lhs.win == rhs.win);
}
/// Inequality for window handles; the exact negation of operator== above.
inline bool operator!=(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs)
{
    return (lhs.win != rhs.win);
}
#else
#error "Unsupported OS platform detected!"
#endif
#if PAL_CLIENT_EXAMPLE
/// Untyped pointer alias that corresponds to an ADDR_HANDLE.
using AddrHandle = void*;
#endif
/// In cases where PAL cannot abstract a Windows VidPnSourceId, this represents an invalid value: all 32 bits set.
/// (Note: zero is a valid value, so it cannot be used as the sentinel.)
constexpr uint32 InvalidVidPnSourceId = ~0u;
constexpr uint32 MaxVertexBuffers = 32; ///< Maximum number of vertex buffers per pipeline.
constexpr uint32 MaxColorTargets = 8; ///< Maximum number of color targets.
constexpr uint32 MaxStreamOutTargets = 4; ///< Maximum number of stream output target buffers.
// MaxDescriptorSets is only declared for clients built against an interface version older than 936.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936
constexpr uint32 MaxDescriptorSets = 2; ///< Maximum number of descriptor sets.
#endif
constexpr uint32 MaxMsaaRasterizerSamples = 16; ///< Maximum number of MSAA samples supported by the rasterizer.
constexpr uint32 MaxAvailableEngines = 12; ///< Maximum number of engines for a particular engine type.
constexpr uint32 MaxNumPlanes = 3; ///< Maximum number of format planes.
// NOTE(review): UINT64_MAX is not provided by an include visible in this header; presumably it arrives transitively
// through palFormat.h / palSysUtil.h - confirm.
constexpr uint64 InternalApiPsoHash = UINT64_MAX; ///< Default Hash for PAL internal pipelines.
/// Identifies a category of GPU engine. Every category maps directly onto a hardware engine, and a GPU may expose
/// several engines of the same type. The engines available on a particular GPU can be queried via
/// Device::GetProperties, which reports them in DeviceProperties.engineProperties[].
enum EngineType : uint32
{
    EngineTypeUniversal = 0, ///< Corresponds to the graphics hardware engine (a.k.a. graphics ring a.k.a 3D).
    EngineTypeCompute   = 1, ///< Corresponds to asynchronous compute engines (ACE).
    EngineTypeDma       = 2, ///< Corresponds to SDMA engines.
    EngineTypeTimer     = 3, ///< Virtual engine that only supports inserting sleeps, used for implementing
                             ///  frame-pacing.
    EngineTypeCount,         ///< Number of engine types.
};
/// Identifies a category of GPU work; each queue type accepts only specific kinds of work. Which QueueTypes are
/// supported on which engines can be queried via IDevice::GetProperties, which reports this in
/// DeviceProperties.engineProperties[].
enum QueueType : uint32
{
    QueueTypeUniversal = 0, ///< Supports graphics commands (draws), compute commands (dispatches), and copy
                            ///  commands.
    QueueTypeCompute   = 1, ///< Supports compute commands (dispatches), and copy commands.
    QueueTypeDma       = 2, ///< Supports copy commands.
    QueueTypeTimer     = 3, ///< Virtual engine that only supports inserting sleeps, used for implementing frame
                            ///  pacing. This is a software-only queue.
    QueueTypeCount,         ///< Number of queue types.
};
/// Bit flags describing which queue types are supported. Each flag is a single bit positioned at the numeric value
/// of the matching QueueType enumerator.
enum QueueTypeSupport : uint32
{
    SupportQueueTypeUniversal = (1u << static_cast<uint32>(QueueTypeUniversal)),
    SupportQueueTypeCompute   = (1u << static_cast<uint32>(QueueTypeCompute)),
    SupportQueueTypeDma       = (1u << static_cast<uint32>(QueueTypeDma)),
    SupportQueueTypeTimer     = (1u << static_cast<uint32>(QueueTypeTimer)),
};
// Many command buffers break down into multiple command streams targeting internal sub-engines. For example, Universal
// command buffers build a primary stream (DE) but may also build a second stream for async compute engine (ACE).
//
// The numeric values depend on PAL_CLIENT_INTERFACE_MAJOR_VERSION: from version 914 onwards AsyncCompute is 1 and
// ConstantEngine is 2 (Count == 3); older versions swap those two values and additionally expose Pup (Count == 4).
// Code must therefore use the enumerator names rather than raw integers.
enum class SubEngineType : uint32
{
Primary = 0, // Subqueue that is the queue itself, rather than an ancillary queue.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914
AsyncCompute = 1, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit.
ConstantEngine = 2, // CP constant update engine that runs in parallel with draw engine.
// Internal usage only.
#else
ConstantEngine = 1, // CP constant update engine that runs in parallel with draw engine.
AsyncCompute = 2, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit.
Pup = 3, // Subqueue that is the queue itself but for PUP-style packets, rather than an
// ancillary queue
#endif
Count, // Number of sub-engine types for the selected interface version.
};
/// Defines the execution priority for a queue, specified either at queue creation or via IQueue::SetExecutionPriority()
/// on platforms that support it. QueuePriority::Normal corresponds to the default priority.
///
/// @note The numeric values are not ordered by priority: Idle (1) is lower than Normal (0). Do not compare
///       enumerators numerically to decide which priority is higher.
enum class QueuePriority : uint32
{
Normal = 0, ///< Normal priority (default).
Idle = 1, ///< Idle, or low priority (lower than Normal).
Medium = 2, ///< Medium priority (higher than Normal).
High = 3, ///< High priority (higher than Normal).
Realtime = 4, ///< Real time priority (higher than Normal).
Count ///< Number of priority levels.
};
/// Bit flags describing which queue priority levels are supported. Each flag is a single bit positioned at the
/// numeric value of the matching QueuePriority enumerator.
enum QueuePrioritySupport : uint32
{
    SupportQueuePriorityNormal   = (1u << static_cast<uint32>(QueuePriority::Normal)),
    SupportQueuePriorityIdle     = (1u << static_cast<uint32>(QueuePriority::Idle)),
    SupportQueuePriorityMedium   = (1u << static_cast<uint32>(QueuePriority::Medium)),
    SupportQueuePriorityHigh     = (1u << static_cast<uint32>(QueuePriority::High)),
    SupportQueuePriorityRealtime = (1u << static_cast<uint32>(QueuePriority::Realtime)),
};
/// Identifies one of the memory heaps a GPU can access.
enum GpuHeap : uint32
{
    GpuHeapLocal         = 0, ///< Local heap visible to the CPU.
    GpuHeapInvisible     = 1, ///< Local heap not visible to the CPU.
    GpuHeapGartUswc      = 2, ///< GPU-accessible uncached system memory.
    GpuHeapGartCacheable = 3, ///< GPU-accessible cached system memory.
    GpuHeapCount              ///< Number of heaps.
};
/// Expresses the access pattern wanted for a memory allocation.
enum GpuHeapAccess : uint32
{
    GpuHeapAccessExplicit       = 0, ///< Memory access is not known. Heaps will be explicitly defined.
    GpuHeapAccessCpuNoAccess    = 1, ///< Memory access from CPU not required.
    GpuHeapAccessGpuMostly      = 2, ///< Memory optimized for reads/writes from GPU and accessible from CPU.
    GpuHeapAccessCpuReadMostly  = 3, ///< Memory optimized for reads from CPU.
    GpuHeapAccessCpuWriteMostly = 4, ///< Memory optimized for writes from CPU.
    GpuHeapAccessCpuMostly      = 5, ///< Memory optimized for read/writes from CPU.
    GpuHeapAccessCount               ///< Number of access modes.
};
#if defined(__unix__)
/// Enumerates the kinds of external handle that can be passed in (only declared on unix builds).
enum class HandleType : uint32
{
    GemFlinkName = 0, ///< GEM flink name (needs DRM authentication, used by DRI2)
    Kms          = 1, ///< KMS handle which is used by all driver ioctls
    DmaBufFd     = 2, ///< DMA-buf fd handle
    KmsNoImport  = 3, ///< Deprecated in favour of, and same behaviour as, HandleType::DmaBufFd; use that instead.
};
#endif
/// Comparison function determines how a pass/fail condition is determined between two values. For depth/stencil
/// comparison, the first value comes from source data and the second value comes from destination data.
///
/// NOTE(review): presumably each enumerator names the relation "first value <op> second value" that must hold for the
/// comparison to pass (e.g. Less passes when source < destination) - confirm against the hardware documentation.
enum class CompareFunc : uint8
{
Never = 0x0,
Less = 0x1,
Equal = 0x2,
LessEqual = 0x3,
Greater = 0x4,
NotEqual = 0x5,
GreaterEqual = 0x6,
_Always = 0x7, // Macro-safe spelling of Always; always available.
// Unfortunately for Linux clients, X.h includes a "#define Always 2" macro. Clients have their choice of either
// undefing Always before including this header or using _Always when dealing with PAL.
#ifndef Always
Always = _Always, // Alias with the same value as _Always; omitted when an Always macro is defined.
#endif
Count // Follows 0x7 whether or not the Always alias is declared, so it is 8 in both cases.
};
/// Defines an offset into a 2D pixel region. Both components are signed 32-bit integers, so negative offsets are
/// representable.
struct Offset2d
{
int32 x; ///< X offset (signed).
int32 y; ///< Y offset (signed).
};
/// Defines an offset into a 3D pixel region. All components are signed 32-bit integers, so negative offsets are
/// representable.
struct Offset3d
{
int32 x; ///< X offset (signed).
int32 y; ///< Y offset (signed).
int32 z; ///< Z offset (signed).
};
/// Defines a floating-point offset into a 3D pixel region. This is the single-precision counterpart of Offset3d.
struct Offset3dFloat
{
float x; ///< X offset.
float y; ///< Y offset.
float z; ///< Z offset.
};
/// Defines an unsigned width and height for a 2D image region. The dimensions could be pixels, blocks, or bytes
/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you
/// get it right.
struct Extent2d
{
uint32 width; ///< Width of region (unsigned; units depend on the consuming interface).
uint32 height; ///< Height of region (unsigned; units depend on the consuming interface).
};
/// Defines a signed width and height for a 2D image region; unlike Extent2d, negative values are representable.
/// The dimensions could be pixels, blocks, or bytes depending on context, so be sure to check documentation for
/// the PAL interface of interest to be sure you get it right.
struct SignedExtent2d
{
int32 width; ///< Width of region (signed).
int32 height; ///< Height of region (signed).
};
/// Defines an unsigned width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or
/// bytes depending on context, so be sure to check documentation for the PAL interface of interest to be sure you
/// get it right. Two Extent3d values can be compared with the operator== / operator!= overloads declared right
/// after this struct.
struct Extent3d
{
uint32 width; ///< Width of region.
uint32 height; ///< Height of region.
uint32 depth; ///< Depth of region.
};
/// Returns true when the two extents agree in width, height and depth.
constexpr bool operator==(const Extent3d& x, const Extent3d& y)
{
    return !((x.width != y.width) || (x.height != y.height) || (x.depth != y.depth));
}
/// Returns true when the two extents differ in at least one dimension.
constexpr bool operator!=(const Extent3d& x, const Extent3d& y)
{
    return !(x == y);
}
/// Defines a signed width, height, and depth for a 3D image region; unlike Extent3d, negative values are
/// representable. The dimensions could be pixels, blocks, or bytes depending on context, so be sure to check
/// documentation for the PAL interface of interest to be sure you get it right.
struct SignedExtent3d
{
int32 width; ///< Width of region (signed).
int32 height; ///< Height of region (signed).
int32 depth; ///< Depth of region (signed).
};
/// Defines a single-precision floating-point width, height, and depth for a 3D image region. The dimensions could
/// be pixels, blocks, or bytes depending on context, so be sure to check documentation for the PAL interface of
/// interest to be sure you get it right.
struct Extent3dFloat
{
float width; ///< Width of region.
float height; ///< Height of region.
float depth; ///< Depth of region.
};
/// Defines a region in 1D space as a starting position plus a size. Note the differing signedness: the offset may
/// be negative, while the extent is unsigned.
struct Range
{
int32 offset; ///< Starting position (signed).
uint32 extent; ///< Region size (unsigned).
};
/// Defines a rectangular region in 2D space as a signed corner position plus an unsigned size.
struct Rect
{
Offset2d offset; ///< Top left corner.
Extent2d extent; ///< Rectangle width and height.
};
/// Defines a box-shaped region in 3D space as a signed corner position plus an unsigned size. The three dimensions
/// of the extent are independent, so the region is not necessarily a cube.
struct Box
{
Offset3d offset; ///< Top left front corner.
Extent3d extent; ///< Box width, height and depth.
};
/// ShaderHash represents a 128-bit shader hash, stored as two 64-bit halves. Use ShaderHashesEqual() or
/// operator== / operator!= to compare two hashes, and ShaderHashIsNonzero() to test for the all-zero value.
struct ShaderHash
{
uint64 lower; ///< Lower 64-bits of hash
uint64 upper; ///< Upper 64-bits of hash
};
/// PipelineHash represents a concatenated pair of 64-bit hashes. Use PipelineHashesEqual() to compare two hashes
/// and PipelineHashIsNonzero() to test for the all-zero value.
struct PipelineHash
{
uint64 stable; ///< Lower 64-bits of hash. "Stable" portion, suitable for e.g. shader replacement use cases.
uint64 unique; ///< Upper 64-bits of hash. "Unique" portion, suitable for e.g. pipeline cache use cases.
};
/// Common shader pre and post compilation stats.
///
/// @note Two members are size_t, whose width follows the host build, so sizeof(CommonShaderStats) and the member
///       offsets are not guaranteed to be identical between 32-bit and 64-bit builds.
struct CommonShaderStats
{
uint32 numUsedVgprs; ///< Number of VGPRs used by this shader
uint32 numUsedSgprs; ///< Number of SGPRs used by this shader
uint32 ldsSizePerThreadGroup; ///< LDS size per thread group in bytes.
size_t ldsUsageSizeInBytes; ///< LDS usage by this shader.
size_t scratchMemUsageInBytes; ///< Amount of scratch mem used by this shader.
gpusize gpuVirtAddress; ///< Gpu mem address of shader ISA code.
union
{
struct
{
uint32 isWave32 : 1; ///< If set, specifies that the shader is compiled in wave32 mode.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as a 32-bit uint; overlays the bitfields above.
} flags; ///< Shader compilation stat flags.
};
/// Per-thread stack sizes, split by which part of the compiler manages them.
/// NOTE(review): the unit (bytes vs. dwords) is not stated in this header - confirm with the consumer.
struct CompilerStackSizes
{
uint32 backendSize; ///< Managed by compiler backend
uint32 frontendSize; ///< Managed by compiler frontend
};
///@{
/// Equality helpers for ShaderHash and PipelineHash. Two hashes are equal only when both of their 64-bit halves
/// match.
///
/// @param [in] hash1 The first 128-bit shader hash or pipeline hash
/// @param [in] hash2 The second 128-bit shader hash or pipeline hash
///
/// @returns True if the hashes are equal (operator!= returns the opposite).
constexpr bool ShaderHashesEqual(const ShaderHash hash1, const ShaderHash hash2)
{
    return !((hash1.lower != hash2.lower) || (hash1.upper != hash2.upper));
}
constexpr bool operator==(const ShaderHash hash1, const ShaderHash hash2)
{
    return ShaderHashesEqual(hash1, hash2);
}
constexpr bool operator!=(const ShaderHash hash1, const ShaderHash hash2)
{
    return !ShaderHashesEqual(hash1, hash2);
}
constexpr bool PipelineHashesEqual(const PipelineHash hash1, const PipelineHash hash2)
{
    return !((hash1.stable != hash2.stable) || (hash1.unique != hash2.unique));
}
///@}
///@{
/// Tests whether a ShaderHash or PipelineHash has any bit set in either of its 64-bit halves.
///
/// @param [in] hash A 128-bit shader hash or pipeline hash
///
/// @returns True if the hash is non-zero.
constexpr bool ShaderHashIsNonzero(const ShaderHash hash)
{
    return ((hash.upper != 0) || (hash.lower != 0));
}
constexpr bool PipelineHashIsNonzero(const PipelineHash hash)
{
    return ((hash.stable != 0) || (hash.unique != 0));
}
///@}
/// Specifies the Display Output Post-Processing (DOPP) desktop texture information, which is provided by OpenGL via
/// interop. DOPP is an OpenGL extension that allows its client to access the desktop texture directly without the
/// need to copy it to system memory. This is only supported on Windows.
struct DoppDesktopInfo
{
gpusize gpuVirtAddr; ///< The VA of the dopp desktop texture. Set to 0 for the non-dopp resource.
uint32 vidPnSourceId; ///< Display source id of the dopp desktop texture.
};
/// Specifies the Direct Capture resource information. Direct Capture is an extension that allows access to the
/// on-screen primary, motion vectors, depth, and camera matrix directly. This is only supported on Windows.
struct DirectCaptureInfo
{
uint32 vidPnSourceId; ///< VidPnSource ID of the on-screen primary.
union
{
struct
{
uint32 preflip : 1; ///< Requires pre-flip primary access
uint32 postflip : 1; ///< Requires post-flip primary access. A DirectCapture resource cannot
/// have pre-flip and post-flip access at the same time
uint32 accessDesktop : 1; ///< Requires access to the desktop
uint32 shared : 1; ///< This resource will be shared between APIs
uint32 frameGenRatio : 4; ///< Frame generation ratio (4-bit field)
uint32 paceGeneratedFrame : 1; ///< Requires pacing the generated frames
uint32 requiresDisplayDcc : 1; ///< Requires display dcc support
uint32 requestMotionVectors : 1; ///< Request DirectCapture access to motion vector data if available
uint32 requestDepth : 1; ///< Request DirectCapture access to depth data if available
uint32 requestCamera : 1; ///< Request DirectCapture access to camera matrix data if available
uint32 initMotionVectors : 1; ///< Initialize the DirectCapture resource to access motion vector data
uint32 initDepth : 1; ///< Initialize the DirectCapture resource to access depth data
uint32 initCamera : 1; ///< Initialize the DirectCapture resource to access camera matrix
uint32 requestHudLessImage : 1; ///< Request DirectCapture access to HUD less image if available
uint32 initHudLessImage : 1; ///< Initialize the DirectCapture resource to access HUD less image
uint32 reserved : 14; ///< Reserved for future use; pads the named fields (18 bits) out to 32 bits.
};
uint32 u32All; ///< Flags packed as a 32-bit uint; overlays the bitfields above.
} usageFlags; ///< Direct Capture usage flags.
OsExternalHandle hNewFrameEvent; ///< Event to notify of a new frame available for pre-flip or post-flip access
OsExternalHandle hFatalErrorEvent; ///< Event to notify of a fatal error
};
/// Specifies parameters for opening a shared GPU resource from a non-PAL device or non-local process.
struct ExternalResourceOpenInfo
{
OsExternalHandle hExternalResource; ///< External GPU resource from another non-PAL device to open.
// The handleType member only exists on unix builds, so the struct layout differs between platforms.
#if defined(__unix__)
HandleType handleType; ///< Type of the external GPU resource to be opened.
#endif
union
{
struct
{
uint32 ntHandle : 1; ///< The provided hExternalResource is an NT handle instead of a default
/// KMT handle.
uint32 androidHwBufHandle : 1; ///< The provided hExternalResource is android hardware buffer handle
/// instead of fd.
uint32 isDopp : 1; ///< This is a Dopp texture, doppDesktopInfo is in use.
uint32 isDirectCapture : 1; ///< This is a Direct Capture resource, directCaptureInfo is in use.
uint32 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices.
uint32 reserved : 27; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< External resource open flags.
// The two members below share storage; flags.isDopp / flags.isDirectCapture indicate which one is in use.
union
{
DoppDesktopInfo doppDesktopInfo; ///< The information of dopp desktop texture.
DirectCaptureInfo directCaptureInfo; ///< The information of direct capture resource.
};
};
/// Packed pixel display enumeration.
///
/// In the medical imaging market space, there are several 10-bit per component color and grayscale displays
/// available. In addition to being high precision, these displays tend to be very high resolution. For grayscale
/// displays, one method of getting high pixel resolution in 10b precision is a proprietary method called
/// "packed pixel". Each of these packed pixel formats packs two/three 10-bit luminance values into a single
/// R8G8B8 pixel.
///
/// Example Displays:
///
/// EIZO GS510
/// NEC MD21GS
/// TOTOKU ME55Xi2
/// FIMI 3/5MP
///
///
/// The enumerations are named in a way to describe the format of the packed pixels. Names for
/// formats with two or three pixels packed into a single word (corresponding to a simple RGB pixel)
/// follow this convention:
///
/// LLLLLL_RRRRRR (L=left pixel, R=right pixel) or
/// LLL_MMM_RRR (L=left pixel, M=middle pixel, R=right pixel)
///
/// The bit order for a pixel follows this convention:
///
/// (ColorBand)MSB(ColorBand)LSB
///
/// For example: G70B54 means that the MSBs are in bits 7-0 of the green channel, and the LSBs
/// are stored in bits 5-4 of the blue channel.
///
enum class PackedPixelType : uint32
{
NotPacked = 0, ///< Pixels not packed, for standard color RGB8 monitor
SplitG70B54_R70B10, ///< 10-bit mono, split screen
SplitB70G10_R70G76, ///< 10-bit mono, split screen
G70B54_R70B10, ///< 10-bit mono, 2 adjacent pixels
B70R32_G70R76, ///< 10-bit mono, 2 adjacent pixels
B70R30_G70R74, ///< 12-bit mono, 2 adjacent pixels
B70_G70_R70, ///< 8-bit mono, 3 adjacent pixels
R70G76, ///< 10-bit mono, single pixel
G70B54, ///< 10-bit mono, single pixel
Native, ///< 10-bit color, without packing
};
/// Enumerates the logging priority levels supported by PAL. Values increase from Debug (0) to Always (5).
///
/// NOTE(review): unlike CompareFunc in this header, the Always enumerator here is not guarded against X.h's
/// "#define Always 2" macro; presumably Linux clients that include X.h first must #undef Always - confirm.
enum class LogLevel : uint32
{
Debug = 0, ///< Debug messages
Verbose, ///< High frequency messages
Info, ///< Low frequency messages
Alert, ///< Warnings
Error, ///< Critical issues
Always ///< All messages
};
/// Lists every log category that PAL itself defines. The enumerator values are used as bit positions by the
/// LogCategoryMask* constants and are listed in the same order as the strings in LogCategoryTable.
enum class LogCategory : uint64
{
    Correctness = 0, ///< Application correctness
    Performance = 1, ///< Application performance
    Internal    = 2, ///< Internal logging
    Display     = 3, ///< Display Info
    Count            ///< Number of categories defined by PAL
};
/// Names used when registering the log categories; one string per category, in LogCategory enumerator order.
constexpr const char* LogCategoryTable[] = { "Correctness", "Performance", "Internal", "Display" };
/// Typedef for log category masks: a 64-bit value holding one bit per LogCategory, at the bit position given by
/// the category's enumerator value.
typedef uint64 LogCategoryMask;
// Each mask shifts a LogCategoryMask-typed one so the shift is evaluated at the full 64-bit width of the mask.
// Shifting a plain int literal would be evaluated in 32 bits and would overflow for any category index >= 31.
/// Log category mask for messages related to application correctness
constexpr LogCategoryMask LogCategoryMaskCorrectness =
    (static_cast<LogCategoryMask>(1) << static_cast<uint32>(LogCategory::Correctness));
/// Log category mask for messages related to application performance
constexpr LogCategoryMask LogCategoryMaskPerformance =
    (static_cast<LogCategoryMask>(1) << static_cast<uint32>(LogCategory::Performance));
/// Log category mask for messages related to internal messages
constexpr LogCategoryMask LogCategoryMaskInternal =
    (static_cast<LogCategoryMask>(1) << static_cast<uint32>(LogCategory::Internal));
/// Log category mask for messages related to display information (e.g. HDR format)
constexpr LogCategoryMask LogCategoryMaskDisplay =
    (static_cast<LogCategoryMask>(1) << static_cast<uint32>(LogCategory::Display));
/// Defines the modes that the GPU Profiling layer can be enabled with. If the GpuProfilerMode is
/// GpuProfilerTraceEnabledTtv or GpuProfilerTraceEnabledRgp, then the GpuProfilerConfig_TraceModeMask is examined to
/// configure the trace type (spm, sqtt or both) requested. The two trace modes differ only in the output format
/// described on each enumerator.
enum GpuProfilerMode : uint32
{
GpuProfilerDisabled = 0, ///< Gpu Profiler is disabled.
GpuProfilerCounterAndTimingOnly = 1, ///< Traces are disabled but perf counter and timing operations are enabled.
GpuProfilerTraceEnabledTtv = 2, ///< Traces are output in format (.csv, .out) for Thread trace viewer.
GpuProfilerTraceEnabledRgp = 3, ///< Trace data is output as .rgp file for Radeon Gpu Profiler.
};
// Key type used to trigger a GPU profiler capture; simply an alias of the utility library's key code type.
using GpuProfilerCaptureTriggerKey = Util::KeyCode;
// Version number of PAL's event logging.
// NOTE(review): presumably bumped whenever the event-logging types declared below change - confirm.
#define PAL_EVENT_LOGGING_VERSION 528
/// This enumeration identifies the source/owner of a resource object, used for event logging. It is carried in
/// ResourceEventInfo::owner.
enum ResourceOwner : uint32
{
ResourceOwnerApplication = 0, ///< The resource is owned by the application
ResourceOwnerPalClient = 1, ///< The resource is owned by the PAL client
ResourceOwnerPal = 2, ///< The resource is owned by PAL
ResourceOwnerUnknown = 3, ///< The resource owner is unknown
};
/// This enumeration lists the usage/category of a resource object to give context in event logging. It is carried
/// in ResourceEventInfo::category.
enum ResourceCategory : uint32
{
ResourceCategoryApplication = 0, ///< The resource is used by the application.
ResourceCategoryRpm = 1, ///< The resource is used by RPM
ResourceCategoryProfiling = 2, ///< The resource is used for profiling (e.g. SQTT, SPM, etc)
ResourceCategoryDebug = 3, ///< The resource is used for debug purposes
ResourceCategoryRayTracing = 4, ///< The resource is used for ray tracing
ResourceCategoryVideo = 5, ///< The resource is used for video encode/decode
ResourceCategoryMisc = 6, ///< Miscellaneous, resource doesn't fit in any of the above categories
ResourceCategoryUnknown = 7, ///< The resource category is unknown
};
/// Set of information about resource ownership and usage, used for event logging. Pairs who owns a resource with
/// what the resource is used for.
struct ResourceEventInfo
{
ResourceOwner owner; ///< Resource owner
ResourceCategory category; ///< Resource category
};
/// General purpose on/off/default tri-state enum. Default is the zero value, so a zero-initialized TriState leaves
/// the decision to the implementation.
enum class TriState : uint8
{
Default = 0, ///< Let implementation decide whether to enable or disable
Enable = 1, ///< Force enable
Disable = 2, ///< Force disable
Count ///< Number of states
};
/// Defines the modes that the GPU Profiling layer can be enabled with.
/**
***********************************************************************************************************************
* @mainpage
*
* Introduction
* ------------
* The Platform Abstraction Library (PAL) provides hardware and OS abstractions for Radeon (GCN+) user-mode 3D graphics
* drivers. The level of abstraction is chosen to support performant driver implementations of several APIs while
* hiding hardware and operating system details from the client.
*
* PAL client drivers will have no HW-specific code; their responsibility is to translate API/DDI commands into PAL
* commands as efficiently as possible. This means that the client should be unaware of hardware registers, PM4
* commands, SP3 shaders, etc. However, PAL is an abstraction of AMD hardware only, so many things in the PAL interface
* have an obvious correlation to hardware features.
*
* PAL client drivers should have little OS-specific code. PAL and its companion utility collection provide
* OS abstractions for almost everything a client might need, but there are some cases where this is unavoidable:
*
* + Handling dynamic library infrastructure. I.e., the client has to implement DllMain() on Windows, etc.
* + OS-specific APIs or extensions. DX may have Windows-specific functionality in the core API, and Vulkan may
* export certain OS-specific features as extensions (like for presenting contents to the screen).
* + Single OS clients (e.g., DX) may choose to make OS-specific calls directly simply out of convenience with no down
* side.
*
*
* The following diagram illustrates the software stack when running a 3D application with a PAL-based UMD. Non-AMD
* components are in gray, UMD client code is blue, AMD static libs linked into the UMD are green, and the AMD KMD
* is in red.
*
* @image html swStack.png
*
* PAL is a relatively _thick_ abstraction layer, typically accounting for the majority of code (excluding SC) in any
* particular UMD built on PAL. The level of abstraction tends to be higher in areas where client APIs are similar,
* and lower (closer to hardware) in areas where client APIs diverge significantly. The overall philosophy is to share
* as much code as possible without impacting client driver performance. Our committed goal is that CPU-limited
* performance should be within 5% of what a native solution could achieve, and GPU-limited performance should be within
* 2%.
*
* PAL uses a C++ interface. The public interface is defined in .../pal/inc, and clients must _only_ include headers
* from that directory. The interface is spread over many header files - typically one per class - in order to clarify
* dependencies and reduce build times. There are three sub-directories in .../pal/inc:
*
* + <b>.../pal/inc/core</b> - Defines the PAL Core (see @ref Overview).
* + <b>.../pal/inc/gpuUtil</b> - Defines the PAL GPU Utility Collection (see @ref GpuUtilOverview).
* + <b>.../pal/inc/util</b> - Defines the PAL Utility Collection (see @ref UtilOverview).
*
*
* @copydoc VersionHistory
*
* Next: @ref Build
***********************************************************************************************************************
*/
/**
***********************************************************************************************************************
* @page Overview PAL Core Overview
*
* ### Introduction
* PAL's core interface is defined in the @ref Pal namespace, and defines an object-oriented model for interacting with
* the GPU and OS. The interface closely resembles the Vulkan and DX12 APIs. Some common features of these
* APIs that are central to the PAL interface:
*
* - All shader stages, and some additional "shader adjacent" state, are glommed together into a monolithic pipeline
* object.
* - Explicit, free-threaded command buffer generation.
* - Support for multiple, asynchronous engines for executing GPU work (graphics, compute, DMA).
* - Explicit system and GPU memory management.
* - Flexible shader resource binding model.
* - Explicit management of stalls, cache flushes, and compression state changes.
*
* However, as a common component supporting multiple APIs, the PAL interface tends to be lower level in places where
* client APIs diverge.
*
* ### Settings
* The PAL library has a number of configuration settings available for the client to modify either programmatically
* or via external settings. PAL also includes infrastructure for building/loading client-specific settings.
* See @ref Settings for a detailed description of this support.
*
* ### Initialization
* The first step to interacting with the PAL core is creating an IPlatform object and enumerating IDevice objects
* representing GPUs attached to the system and, optionally, IScreen objects representing displays attached to the
* system. See @ref LibInit for a detailed description.
*
* ### System Memory Allocation
* Clients have a lot of control over PAL's system memory allocations. Most PAL objects require the client to provide
* system memory; the client first calls a GetSize() method and then passes a pointer to PAL on the actual create call.
* Further, when PAL needs to make an internal allocation, it will optionally call a client callback, which can be
* specified on platform creation. This callback will specify a category for the allocation, which may imply an
* expected lifetime.
*
* ### Interface Classes
* The following diagram illustrates the relationship of some key PAL interfaces and how they interact to render a
* typical frame in a modern game. Below that is a listing of all of PAL's interface classes, and a very brief
* description of their purpose. Follow the link for each interface to see detailed reference documentation.
*
* @image html scheduling.png
*
* - __OS Abstractions__
* + _IPlatform_: Root-level object created by clients that interact with PAL. Mostly responsible for enumerating
* devices and screens attached to the system and returning any system-wide properties.<br><br>
* + _IDevice_: Configurable context for querying properties of a particular GPU and interacting with it. Acts as a
* factory for almost all other PAL objects.<br><br>
* + _IQueue_: A device has one or more _engines_ which are able to issue certain types of work. Tahiti, for example,
* has 1 universal engine (supports graphics, compute, or copy commands), 2 compute engines (support
* compute or copy commands), and 2 DMA engines (support only copy commands). An IQueue object is a
* context for submitting work on a particular engine. This mainly takes the form of submitting command
* buffers and presenting images to the screen. Work performed in a queue will be started in order, but
* work executed on different queues (even if the queues reference the same engine) is not guaranteed
* to be ordered without explicit synchronization.<br><br>
* + _IQueueSemaphore_: Queue semaphores can be signaled and waited on from an IQueue in order to control execution
* order between queues.<br><br>
* + _IFence_: Used for coarse-grain CPU/GPU synchronization. Fences can be signalled from the GPU as part of a
* command buffer submission on a queue, then waited on from the CPU.<br><br>
* + _IGpuMemory_: Represents a GPU-accessible memory allocation. Can either be virtual (only VA allocation which
* must be explicitly mapped via an IQueue operation) or physical. Residency of physical allocations
* must be managed by the client either globally for a device (IDevice::AddGpuMemoryReferences) or by
* specifying allocations referenced by command buffers at submit.<br><br>
* + _ICmdAllocator_: GPU memory allocation pool used for backing an ICmdBuffer. The client is free to create one
* allocator per device, or one per thread to remove thread contention.<br><br>
* + _IScreen_: Represents a display attached to the system. Mostly used for managing full-screen flip
* presents.<br><br>
* + _IPrivateScreen_: Represents a display that is not otherwise visible to the OS, typically a VR head mounted
* display.<br><br>
* - __Hardware IP Abstractions__
* + __All IP__
* - _ICmdBuffer_: Clients build command buffers to execute the desired work on the GPU, and submit them on a
* corresponding queue. Different types of work can be executed depending on the _queueType_ of
* the command buffer (graphics work, compute work, DMA work).<br><br>
* - _IImage_: Images are a 1D, 2D, or 3D collection of pixels (i.e., _texture_) that can be accessed by the
* GPU in various ways: texture sampling, BLT source/destination, UAV, etc.<br><br>
* + __GFXIP-only__
* - _IShader_: Container for shader byte code used as an input to pipeline creation. No compilation occurs
* until an IPipeline is created. Currently, AMDIL is the only supported input language.<br><br>
* - _IPipeline_: Comprised of all shader stages (CS for compute, VS/HS/DS/GS/PS for graphics), resource mappings
* describing how user data entries are to be used by the shaders, and some other fixed-function
* state like depth/color formats, blend enable, MSAA enable, etc.<br><br>
* - _IColorTargetView_: IImage view allowing the image to be bound as a color target (i.e., RTV).<br><br>
* - _IDepthStencilView_: IImage view allowing the image to be bound as a depth/stencil target (i.e., DSV).<br><br>
* - _IGpuEvent_: Used for fine-grained (intra-command buffer) synchronization between the CPU and GPU. GPU
* events can be set/reset from either the CPU or GPU and waited on from either.<br><br>
* - _IQueryPool_: Collection of query slots for tracking occlusion or pipeline stats query results.<br><br>
* - __Dynamic State Objects__: _IColorBlendState_, _IDepthStencilState_, _IMsaaState_, _IScissorState_,
* and _IViewportState_ define logical collections of related fixed function graphics
* state, similar to DX11.<br><br>
* - _IPerfExperiment_: Used for gathering performance counter and thread trace data.<br><br>
* - _IBorderColorPalette_: Provides a collection of indexable colors for use by samplers that clamp to an
* arbitrary border color.<br><br>
* - __Common Base Classes__
* + _IDestroyable_: Defines a _Destroy()_ method for the PAL interface. Calling _Destroy()_ will release any
* internally allocated resources for the object, but the client is still responsible for freeing
* the system memory provided for the object.<br><br>
* + _IGpuMemoryBindable_: Defines a set of methods for binding GPU memory to the object. Interfaces that inherit
* _IGpuMemoryBindable_ require GPU memory in order to be used by the GPU. The client
* must query the requirements (e.g., alignment, size, heaps) and allocate/bind GPU memory
* for the object. _IGpuMemoryBindable_ inherits from _IDestroyable_.<br><br>
*
* ### %Format Info
* Several helper methods are available for dealing with image formats in the @ref Formats namespace.
*
* ### Graphics/Compute Execution Model
* Most graphics/compute work is defined by first binding a set of states then issuing a draw or dispatch command to
* kick off the work. The complete set of graphics states available in PAL is illustrated below; compute is a subset
* of this that only includes the pipeline, user data entries, and border color palette.
*
* @image html stateBreakdown.jpg
*
* Most of these correspond directly to a PAL interface object above, and these items are bound by calling a
* corresponding _CmdBind...()_ method in the ICmdBuffer interface. The states marked in yellow and orange, however,
* are _immediate_ states for which there is no object; you just specify the required state values in the corresponding
* _CmdSet...()_ method in the ICmdBuffer interface.
*
* User data entries are the way that input resources are specified for the pipeline on an upcoming draw/dispatch. This
* mapping is complicated, and is described fully in @ref ResourceBinding.
*
* A final complication worth noting is that PAL provides no implicit surface synchronization. The client is
* responsible for explicitly inserting barriers to resolve data hazards, flush/invalidate caches, and ensure images
* are in the proper compression state. For more detail, see ICmdBuffer::CmdReleaseThenAcquire, CmdRelease, CmdAcquire,
* CmdReleaseEvent, CmdAcquireEvent and AcquireReleaseInfo.
*
***********************************************************************************************************************
*/
} // Pal
@@ -0,0 +1,204 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palCmdAllocator.h
* @brief Defines the Platform Abstraction Library (PAL) ICmdAllocator interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
namespace Pal
{
// Forward declarations.
struct GpuMemSubAllocInfo;
class IGpuMemory;
/// Flags controlling the creation of ICmdAllocator objects.
///
/// The individual bitfields (1 + 1 + 1 + 1 + 28 = 32 bits) share storage with u32All, so the complete set of flags
/// can also be accessed as one packed 32-bit value.
union CmdAllocatorCreateFlags
{
struct
{
uint32 threadSafe : 1; ///< If set, the allocator will acquire a lock each time it is accessed;
/// otherwise it will not attempt to protect itself from multithreaded
/// access.
uint32 autoMemoryReuse : 1; ///< If set, the allocator will track when the GPU finishes accessing
/// each piece of command memory and attempt to reuse memory which the
/// GPU is done with before allocating more memory from the OS. If not
/// set, memory will only be recycled after a call to
/// @ref ICmdAllocator::Reset().
uint32 disableBusyChunkTracking : 1; ///< If set, the allocator will not do any GPU-side tracking of which
/// command chunks are still in use. It will be the client's (or the
/// application's) responsibility to guarantee that command chunks are
/// not returned to the allocator before the GPU has finished processing
/// them. Failure to guarantee this will result in undefined behavior.
/// This flag has no effect if @ref autoMemoryReuse is not set.
uint32 autoTrimMemory : 1; ///< If set, the allocator will automatically trim down the allocations
/// (where all chunks are idle on the freeList). A minimum of
/// allocFreeThreshold allocations (see CmdAllocatorCreateInfo) are kept
/// for fast reuse.
uint32 reserved : 28; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
};
/// The different types of allocation data that an ICmdAllocator allocates and distributes to command buffers.
///
/// CmdAllocatorTypeCount sizes the per-type CmdAllocatorCreateInfo::allocInfo array, and the enumerator values are
/// used as bit positions in the allocTypeMask passed to ICmdAllocator::Trim() (e.g. 1 << EmbeddedDataAlloc).
enum CmdAllocType : uint32
{
CommandDataAlloc = 0, ///< Data allocated is for executable commands.
EmbeddedDataAlloc, ///< Data allocated is for embedded data.
LargeEmbeddedDataAlloc, ///< Data allocated is for embedded data where the allocation is larger than 32kb.
GpuScratchMemAlloc, ///< Data allocated is GPU-only accessible at command buffer execution-time. Possible
/// uses include GPU events.
CmdAllocatorTypeCount ///< Number of allocation types for ICmdAllocator's.
};
/// Specifies properties for creation of an ICmdAllocator object. Input structure to IDevice::CreateCmdAllocator().
///
/// One allocInfo entry must be filled in for every allocation type (CmdAllocatorTypeCount entries in total);
/// presumably each entry is indexed by its @ref CmdAllocType value - confirm against the implementation.
struct CmdAllocatorCreateInfo
{
CmdAllocatorCreateFlags flags; ///< Flags controlling command allocator creation.
struct
{
GpuHeap allocHeap; ///< Preferred allocation heap. For @ref GpuScratchMemAlloc, this field is
/// ignored and the allocation will always be in GPU-invisible memory. For
/// all other allocation types, this must be CPU-mappable.
/// For best performance, command allocators that will be used by the
/// UVD engine should prefer the Local heap.
gpusize allocSize; ///< Size, in bytes, of the GPU memory allocations this allocator will create.
/// It must be an integer multiple of suballocSize.
gpusize suballocSize; ///< Size, in bytes, of the chunks of GPU memory this allocator will give to
/// command buffers. It must be an integer multiple of 4096.
/// Must be greater than zero even if the client doesn't plan on using this
/// allocation type.
uint32 allocFreeThreshold; ///< Minimum count of free allocations that the allocator should keep around
/// for fast reuse. It is only used when the autoTrimMemory flag is set.
} allocInfo[CmdAllocatorTypeCount]; ///< Information for each allocation type.
};
/// Output structure for ICmdAllocator::QueryUtilizationInfo().
/// The CmdAllocator utilization data can be queried by PAL clients in order to decide whether to trim the allocations.
/// The counts describe a single CmdAllocType, namely the one passed to the query.
struct CmdAllocatorUtilizationInfo
{
uint32 numAllocations; ///< Number of allocations owned by the allocator.
uint32 numFreeChunks; ///< Number of chunks that are reset and not in use.
uint32 numBusyChunks; ///< Number of chunks that are in use by the GPU.
uint32 numReuseChunks; ///< Number of chunks that have been 'returned' to the allocator for reuse.
};
/**
 ***********************************************************************************************************************
 * @interface ICmdAllocator
 * @brief     Hands out GPU memory to command buffers on the client's behalf.
 *
 * Every ICmdBuffer object is tied to an ICmdAllocator when it is created, and may move to a different allocator when
 * ICmdBuffer::Reset() is called.  While commands are being built, the command buffers associated with an allocator
 * query it for additional GPU memory.
 *
 * Unless the client can guarantee that all command buffers sharing an allocator are built, reset, and destroyed in a
 * thread-safe manner, it must ask for a thread-safe command allocator to protect against race conditions.  Destroying
 * a command allocator that still has command buffers associated with it is illegal.
 *
 * @see IDevice::CreateCmdAllocator()
 ***********************************************************************************************************************
 */
class ICmdAllocator : public IDestroyable
{
public:
    /// Explicitly resets the command allocator, which marks all of its internal GPU memory allocations as unused.
    ///
    /// Before calling this, the client must guarantee that every command buffer associated with this allocator has
    /// finished GPU execution and has been explicitly reset.
    ///
    /// @param [in] freeMemory  Whether all GPU and CPU memory allocations should be returned to the OS.
    ///
    /// @returns Success if the command allocator was successfully reset.  Otherwise, one of the following errors may
    ///          be returned:
    ///          + ErrorUnknown if an internal PAL error occurs.
    virtual Result Reset(bool freeMemory) = 0;

    /// Explicitly trims the command allocator, deleting as many unused internal GPU memory allocations as possible.
    ///
    /// @param [in] allocTypeMask     Selects, per CmdAllocType, whether trimming is applied.
    ///                               Use (1 << CmdAllocatorTypeCount) - 1 to trim every type.
    ///                               To trim only the embedded data use (1 << EmbeddedDataAlloc).
    /// @param [in] dynamicThreshold  Minimum count of free allocations that the allocator should keep around.
    ///
    /// @returns Success if the command allocator was successfully trimmed.
    virtual Result Trim(uint32 allocTypeMask, uint32 dynamicThreshold) = 0;

    /// Reports the number of allocations and chunks for one allocation type of this allocator.
    /// Clients can use the result to decide whether trimming is worthwhile.
    ///
    /// @param [in]  type              CmdAllocType that is being queried.
    /// @param [out] pUtilizationInfo  Receives the allocation and chunk counts.
    ///
    /// @returns Success if valid values can be reported.
    virtual Result QueryUtilizationInfo(CmdAllocType type, CmdAllocatorUtilizationInfo* pUtilizationInfo) const = 0;

    /// Retrieves the arbitrary client data pointer associated with this object.
    /// Lets the client attach its own data to a particular PAL object.
    ///
    /// @returns Pointer to client data.
    void* GetClientData() const { return m_pClientData; }

    /// Stores the arbitrary client data pointer associated with this object.
    /// Lets the client attach its own data to a particular PAL object.
    ///
    /// @param [in] pClientData  A pointer to arbitrary client data.
    void SetClientData(void* pClientData) { m_pClientData = pClientData; }

protected:
    /// @internal Constructor.  Prevents use of the new operator on this interface.  Clients must create objects by
    /// explicitly calling the proper create method.
    ICmdAllocator() = default;

    /// @internal Destructor.  Prevents use of the delete operator on this interface.  Clients must destroy objects by
    /// explicitly calling IDestroyable::Destroy() and are responsible for freeing the system memory allocated for the
    /// object on their own.
    virtual ~ICmdAllocator() = default;

private:
    /// @internal Client data pointer; starts out null.  It can hold an arbitrary value, which is read back with
    /// GetClientData() and changed with SetClientData().
    /// For non-top-layer objects, this will point to the layer above the current object.
    void* m_pClientData = nullptr;
};
} // Pal
Datei-Diff unterdrückt, da er zu groß ist Diff laden
@@ -0,0 +1,370 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palCmdTracking.h
* @brief Defines a number of support classes used for construction and storage of struct TrackedCmdLocation
* defined in trackedCmdLocation.h
*
* - struct TrackingEventInfo: A single mapping from uint8 to name, used for logging
* - class TrackedCmdSupportBase A set of TrackingEventInfo, maintained outside of Pal
* - class TrackedCmdLocationArray The arrays for TrackedCmdLocation's used for reporting
* correlation data through ICmdBufferReporting::CorrelationReportOnSubmit
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palVector.h"
#include "trackedCmdLocation.h"
namespace Pal
{
// forward decl
class Platform;
namespace CmdDisassembly
{
// forward definition
class TrackedCmdLocationArray;
/**
************************************************************************************************************************
* @brief class TrackedCmdLocationRef
* A copyable reference to a member in a TrackedCmdLocationArray, invariant to that array be
* re-allocated.
*
* @detail Is simply a pointer to a TrackedCmdLocationArray, and an index in to that array
*
************************************************************************************************************************
*/
class TrackedCmdLocationRef
{
public:
TrackedCmdLocationRef()
: m_pSourceArray(nullptr),
m_index(0)
{
}
TrackedCmdLocationRef(
TrackedCmdLocationArray* pSourceArray,
Util::uint32 index)
: m_pSourceArray(pSourceArray),
m_index(index)
{
}
TrackedCmdLocationRef(
TrackedCmdLocationRef&& other) = default;
TrackedCmdLocationRef(
TrackedCmdLocationRef const& other) = default;
TrackedCmdLocationRef& operator=(
TrackedCmdLocationRef&& other) = default;
TrackedCmdLocationRef& operator=(
TrackedCmdLocationRef const& other) = default;
bool operator==(
TrackedCmdLocationRef const& other) const
{ return (this->m_pSourceArray == other.m_pSourceArray) && (this->m_index == other.m_index); }
bool operator!=(
TrackedCmdLocationRef const& other) const
{ return (this->m_pSourceArray != other.m_pSourceArray) || (this->m_index != other.m_index); }
TrackedCmdLocation* Use();
const TrackedCmdLocation* Get() const;
Util::uint32 GetIndex() const
{
return m_index;
}
/// Helper functions
///
/// Clears the TrackedCmdLocation referred to by this TrackedCmdLocationRef
///
/// @returns
/// Result::ErrorInvalidPointer if (IsValid() == false)
/// Result::Success if successful
Result Clear();
/// @returns
/// TrackedCmdLocationMode::Invalid if (IsValid() == false)
/// Get()->m_mode otherwise
TrackedCmdLocationMode GetMode() const;
/// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef
/// to mode TrackedCmdLocationMode::Before
///
/// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it
/// Most likely, a value registered to a TrackedCmdSupportBase
/// @param [in] beforePtr The end pointer for the cmdList being tracked before the event referred to by eventId
/// Only 48-bits of beforePtr are used
///
/// @returns
/// Result::ErrorInvalidPointer if (IsValid() == false)
/// Result::Success if successful
Result SetAsBefore(
uint8 eventId,
uint64 beforePtr);
/// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef
/// to mode TrackedCmdLocationMode::After
///
/// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it
/// Most likely, a value registered to a TrackedCmdSupportBase
/// @param [in] afterPtr The end pointer for the cmdList being tracked after the event referred to by eventId
/// Only 48-bits of afterPtr are used
///
/// @returns
/// Result::ErrorInvalidPointer if (IsValid() == false)
/// Result::Success if successful
Result SetAsAfter(
uint8 eventId,
uint64 afterPtr);
/// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef
/// to mode TrackedCmdLocationMode::Delta, with no begin or end (ie, no data can be written to
/// the cmdList being tracked "during" the event referred to be eventId
///
/// @param [in] eventId Refers to an uint8 event that does not have a begin and/or an end associated with it
/// Such as Pal::CmdDisassembly::TrackedCmdLocation::PostClientEvent
/// @param [in] ptr The end pointer for the cmdList being tracked after the event referred to by eventId
/// Only 48-bits of ptr are used
///
/// @returns
/// Result::ErrorInvalidPointer if (IsValid() == false)
/// Result::Success if successful
Result SetAsEmptyDelta(
uint8 eventId,
uint64 ptr);
/// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef
/// to mode TrackedCmdLocationMode::ClientId
///
/// @param [in] clientId A 61-bit bit value used by the client application to identify which cmdList is being
/// tracked
///
/// @returns
/// Result::ErrorInvalidPointer if (IsValid() == false)
/// Result::Success if successful
Result SetAsClientId(
uint64 clientId);
/// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef
/// to mode TrackedCmdLocationMode::ClientEventId
///
/// @param [in] clientEventId A 61-bit bit value used by the client application to identify
/// a client event relative to the current end position of the cmdList being tracked
///
/// @returns
/// Result::ErrorInvalidPointer if (IsValid() == false)
/// Result::Success if successful
Result SetAsClientEvent(
uint64 clientEventId);
/// @brief bool TrackedCmdLocation::TrySetAsDelta(uint64 afterPtr)
/// Will attempt to set this TrackedCmdLocation to type TrackedCmdLocationMode::Delta
///
/// @detail If GetMode() == TrackedCmdLocationMode::Before and afterPtr - m_correlateInternal.m_ptr is small
/// enough to be encoded in m_correlateInternal.m_deltaInDWords, the mode will be altered to
/// TrackedCmdLocationMode::Delta, with afterPtr - m_correlateInternal.m_ptr encoded in
/// m_correlateInternal.m_deltaInDWords.
/// If this attempt fails, the calling function should instead create a TrackedCmdLocationMode::After
/// TrackedCmdLocation
///
/// @param [in] afterPtr, the value a TrackedCmdLocationMode::After would have for m_correlateInternal.m_ptr
/// @return Result::Success if it was possible to set this TrackedCmdLocation to type
/// TrackedCmdLocationMode::Delta
/// Result::Unsupported if the conditions described above are not met.
Result TrySetAsDelta(
uint64 afterPtr);
private:
TrackedCmdLocationArray* m_pSourceArray;
Util::uint32 m_index;
Result SetMode(
TrackedCmdLocationMode mode);
};
/// @brief struct TrackingEventInfo
/// Essentially just a name, plus a boolean to indicate whether the name is valid / has been set.
///
/// A default-constructed instance is marked as not valid; the name member is only default-initialized.
struct TrackingEventInfo
{
Util::StringView<char> name; ///< Display name of the event; meaningful only while isValid is true.
bool isValid; ///< True once a name has been assigned to this entry.
/// Creates an entry without a name: isValid starts out false.
TrackingEventInfo()
: isValid(false)
{}
};
/**
************************************************************************************************************************
* @brief class TrackedCmdSupportBase translates eventId's to strings for internal correlation events
*
* @detail For use in Pal::Queue when dumping to text files. Corresponds to
* TrackedCmdLocation::m_correlateInternal.m_event for the cases where TrackedCmdLocation::m_mode
* is not TrackedCmdLocationMode::ClientEvent
*
* The implementation for this is in whatever client of Pal that is creating the internal correlation events,
*
************************************************************************************************************************
*/
class TrackedCmdSupportBase
{
public:
virtual ~TrackedCmdSupportBase() = default;
void SetEventIdName(
uint8 eventId,
const char* name)
{
PAL_ASSERT(static_cast<uint32>(eventId) < NumUInt8Values);
m_allEventsMap[eventId].name = name;
m_allEventsMap[eventId].isValid = true;
}
TrackingEventInfo const& GetEventInfo(
uint8 eventId) const
{
PAL_ASSERT(static_cast<uint32>(eventId) < NumUInt8Values);
return m_allEventsMap[eventId];
}
protected:
static constexpr uint32 NumUInt8Values = UINT8_MAX + 1;
TrackingEventInfo m_allEventsMap[NumUInt8Values];
TrackedCmdSupportBase() = default;
};
/**
 ************************************************************************************************************************
 * @brief class TrackedCmdLocationArray is simply a TrackedCmdLocationVec together with a clientId
 *        and some helpers.  TrackedCmdLocationArray objects live on Pal::GfxCmdBuffer
 *
 * @detail Each Pal::GfxCmdBuffer has at most CmdDisassembly::MaxNumSubCmdBuffers TrackedCmdLocationArray's,
 *         corresponding to Pal::GfxCmdBuffer::NumCmdStreams();
 *
 *         The clientId stored in TrackedCmdLocationArray::m_clientId corresponds to the client id used in
 *         TrackedCmdLocation::m_clientId.m_clientId
 *
 *         For the moment the underlying storage is
 *         Util::Vector<TrackedCmdLocation, DefaultCapacity, Pal::Platform>, but it could be changed to a chunk
 *         scheme, especially as cmdLists can become very large.
 *         The only requirement on such a change is that TrackedCmdLocationRef continues to function as an accessor.
 *
 *         Note that the functions in TrackedCmdLocationArray are not designed for thread-safety, as they are
 *         issued from command-list-building functions that are, in their turn, not thread safe.  Adding mutex
 *         behavior here would potentially hide issues relating to thread-safety.
 *
 ************************************************************************************************************************
 */
class TrackedCmdLocationArray
{
public:
    static constexpr uint32 DefaultCapacity = 1024;        ///< Capacity template argument of the location vector.
    static constexpr uint32 BadIndex        = UINT32_MAX;  ///< Index stored in the "last location" after Reset().
    static constexpr uint64 InvalidClientId = UINT64_MAX;  ///< Client id stored after Reset().

    using TrackedCmdLocationVec = Util::Vector<TrackedCmdLocation, DefaultCapacity, Pal::Platform>;

    /// @returns sizeof(TrackedCmdLocationArray), presumably the amount of memory a caller has to provide to
    ///          CreateTrackedCmdLocationArray() - confirm against the implementation.
    static uint32 GetTrackedCmdLocationArraySizeInBytes() { return sizeof(TrackedCmdLocationArray); }

    /// Factory defined out of line (the constructor is private).
    ///
    /// @param [in] pMemory    Caller-provided memory; presumably the object is constructed in place here - confirm.
    /// @param [in] pPlatform  Platform forwarded to the new object.
    static TrackedCmdLocationArray* CreateTrackedCmdLocationArray(void* pMemory, Pal::Platform* pPlatform);

    /// Returns the array to its empty state: the last location becomes (this, BadIndex), the client id becomes
    /// InvalidClientId and every stored location is cleared.
    void Reset()
    {
        m_lastLocation = TrackedCmdLocationRef(this, BadIndex);
        m_clientId     = InvalidClientId;
        m_locations.Clear();
    }

    /// Defined out of line (the destructor is private, so this is the way to dispose of the object).
    void Destroy();

    /// @returns The client id currently associated with this array.
    uint64 GetClientId() const { return m_clientId; }

    /// Associates a client id with this array.  Defined out of line.
    Result SetClientId(uint64 clientId);

    /// @returns The size reported by the underlying location vector.
    Util::uint32 GetTotalSize() const { return m_locations.size(); }

    /// @returns Read-only access to the underlying location vector.
    const TrackedCmdLocationVec& GetLocationsVec() const { return m_locations; }

    /// @returns Mutable access to the underlying location vector.
    TrackedCmdLocationVec& UseLocationsVec() { return m_locations; }

    /// Defined out of line; judging by its name and signature it produces the next location and reports a
    /// reference to it through @p pResult - confirm in the implementation.
    Pal::Result MakeNext(TrackedCmdLocationRef* pResult);

    /// @returns A copy of the reference to the last location.
    const TrackedCmdLocationRef GetLast() const { return m_lastLocation; }

    /// @returns true if @p location is equal to the last location of this array.
    bool IsLast(const TrackedCmdLocationRef& location) const { return m_lastLocation == location; }

private:
    // Construction and destruction are private; see CreateTrackedCmdLocationArray() and Destroy().
    TrackedCmdLocationArray(Pal::Platform* pPlatform);
    ~TrackedCmdLocationArray() = default;

    TrackedCmdLocationVec m_locations;     ///< Storage for the tracked locations.
    Pal::Platform*        m_pPlatform;     ///< Platform handed to the constructor.
    uint64                m_clientId;      ///< Client id of this array; InvalidClientId after Reset().
    TrackedCmdLocationRef m_lastLocation;  ///< Reference to the last location; (this, BadIndex) after Reset().
};
} // namespace CmdDisassembly
} // namespace Pal
@@ -0,0 +1,70 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palDestroyable.h
* @brief Defines the Platform Abstraction Library (PAL) IDestroyable interface.
***********************************************************************************************************************
*/
#pragma once
namespace Pal
{
/**
 ***********************************************************************************************************************
 * @interface IDestroyable
 * @brief     Base interface of every object that the client has to destroy explicitly.
 *
 * That covers all objects, with the following exceptions:
 *
 * + @ref IColorTargetView, @ref IDepthStencilView - The DX12 runtime treats these classes as SRDs.  PAL therefore
 *   guarantees that nothing has to happen at Destroy() - the client should just free the memory backing these
 *   classes.
 * + @ref IDevice - These objects are created during IPlatform::EnumerateDevices() and are automatically destroyed
 *   along with the Platform object.
 * + @ref IPrivateScreen - These objects are created during IPlatform::EnumerateDevices() based on which screens are
 *   attached to each device.  They are automatically destroyed along with the Platform object.
 ***********************************************************************************************************************
 */
class IDestroyable
{
public:
    /// Releases all resources associated with this object.
    ///
    /// The client must only call this once no references to the object remain.  The system memory backing the
    /// object (as specified in pPlacementAddr during creation) is not freed here; since the client allocated that
    /// memory, the client is responsible for freeing it.
    virtual void Destroy() = 0;

protected:
    /// @internal Destructor.  Prevents use of the delete operator on this interface.  Clients must destroy objects by
    /// explicitly calling IDestroyable::Destroy() and are responsible for freeing the system memory allocated for the
    /// object on their own.
    virtual ~IDestroyable() = default;
};
} // Pal
@@ -0,0 +1,626 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palDeveloperHooks.h
* @brief Common include for PAL developer callbacks. Defines common enums, typedefs, structures, etc.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palCmdBuffer.h"
namespace Pal
{
// Forward declarations.
class ICmdBuffer;
class IImage;
class IPipeline;
namespace Developer
{
/// The type of the developer callback so the callback can properly perform whatever actions it needs.
///
/// @note The developer-only enumerators are declared in the middle of the list, so the numeric values of the
/// enumerators that follow them (BindGpuMemory onwards) differ between builds with and without
/// PAL_DEVELOPER_BUILD.
///
/// @see Callback
enum class CallbackType : uint32
{
AllocGpuMemory = 0, ///< This callback is to inform that GPU memory has been allocated.
FreeGpuMemory, ///< This callback is to inform that GPU memory has been freed.
PresentConcluded, ///< This callback is to inform that a present has concluded.
ImageBarrier, ///< This callback is to inform that a barrier is being executed.
CreateImage, ///< This callback is to inform that an image has been created.
BarrierBegin, ///< This callback is to inform that a barrier is about to be executed.
BarrierEnd, ///< This callback is to inform that a barrier is done being executed.
DrawDispatch, ///< This callback is to inform that a draw or dispatch command is being recorded.
BindPipeline, ///< This callback is to inform that a pipeline (client or internal) has been bound.
SurfRegData, ///< This callback is to inform tools of the register state of a surface.
#if PAL_DEVELOPER_BUILD
DrawDispatchValidation, ///< This callback is to describe the state validation needed by a draw or dispatch.
BindPipelineValidation, ///< This callback is to describe the state validation needed by a pipeline bind.
OptimizedRegisters, ///< This callback is to describe the PM4 optimizer's removal of redundant register
/// sets.
#endif
BindGpuMemory, ///< This callback is to inform of a new binding to GPU memory.
SubAllocGpuMemory, ///< This callback is to inform of suballocation from base GPU memory allocation.
SubFreeGpuMemory, ///< This callback is to inform that GPU memory suballocation has been freed.
#if PAL_DEVELOPER_BUILD
RpmBlt, ///< This callback is to describe the internal RPM blt calls.
#endif
Count, ///< The number of callback types.
};
/// Mask covering all callback types, generated from CallbackType::Count.
/// NOTE(review): presumably the Count low bits are set - confirm against Util::BitfieldGenMask.
constexpr uint32 AllCallbackTypesMask = Util::BitfieldGenMask(static_cast<uint32>(CallbackType::Count));

/// Callback types that are excluded from DefaultEnabledCallbackTypes: the GPU memory bind and suballocation
/// notifications. Each callback type occupies the bit whose index equals its enumerator value.
/// The shifts use an unsigned literal so the whole expression is evaluated as an unsigned value; shifting the signed
/// literal 1 would stop being a valid constant expression if an enumerator value ever reached bit 31.
constexpr uint32 DefaultDisabledCallbackTypes = (1u << static_cast<uint32>(CallbackType::BindGpuMemory))     |
                                                (1u << static_cast<uint32>(CallbackType::SubAllocGpuMemory)) |
                                                (1u << static_cast<uint32>(CallbackType::SubFreeGpuMemory));

/// Every callback type in AllCallbackTypesMask except those listed in DefaultDisabledCallbackTypes.
constexpr uint32 DefaultEnabledCallbackTypes = AllCallbackTypesMask & ~DefaultDisabledCallbackTypes;
/// Definition for developer callback.
///
/// @param [in] pPrivateData Private data that is installed with the callback for use by the installer.
/// @param [in] deviceIndex Unique index for the device so that the installer can properly dispatch the event.
/// @param [in] type Type of the callback (see @ref CallbackType) so the installer can make informed
/// decisions about what actions to perform.
/// @param [in] pCbData Additional data related to the particular callback type.
typedef void (PAL_STDCALL *Callback)(
void* pPrivateData,
const uint32 deviceIndex,
CallbackType type,
void* pCbData);
/// Enumeration describing the different ways GPU memory is allocated.
///
/// @see GpuMemoryData::allocMethod
enum class GpuMemoryAllocationMethod : uint32
{
Unassigned = 0, ///< Unassigned allocation method.
Normal, ///< Virtual memory allocation (not pinned/peer).
Pinned, ///< Pinned memory allocation.
Peer, ///< Peer memory allocation.
MultiDevice, ///< MultiDevice memory allocation.
Opened, ///< Shared memory allocation.
Svm, ///< Shared virtual memory allocation.
};
/// Enumeration describing the different presentation modes an application can be in.
///
/// @see PresentationModeData::presentationMode
enum class PresentModeType : uint32
{
Unknown = 0, ///< When the present mode is not known.
Flip, ///< When the presentation surface is used directly as the front buffer.
Composite, ///< When the flipped image is drawn by a window compositor instead
/// of the application.
Blit, ///< When the presentation surface is copied to the front buffer.
};
/// Information about the presentation mode an application is in.
struct PresentationModeData
{
PresentModeType presentationMode; ///< The present mode in use (see @ref PresentModeType).
UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present.
};
/// Information for allocation/deallocation of GPU memory.
struct GpuMemoryData
{
gpusize size; ///< Size, in bytes, of the allocation.
GpuHeap heap; ///< The first requested heap of the allocation.
/// Allocation description flags. The eight single-bit flags plus the reserved field add up to 32 bits.
struct Flags
{
uint32 isClient : 1; ///< This allocation is requested by the client.
uint32 isFlippable : 1; ///< This allocation is marked as flippable.
uint32 isUdmaBuffer : 1; ///< This allocation is for a UDMA buffer.
uint32 isVirtual : 1; ///< This allocation is for virtual memory.
uint32 isCmdAllocator : 1; ///< This allocation is for a CmdAllocator.
uint32 isExternal : 1; ///< This allocation is marked as external.
uint32 buddyAllocated : 1; ///< This allocation is buddy allocated.
uint32 appRequested : 1; ///< This allocation is PAL internal, but application requested.
uint32 reserved : 24; ///< Reserved for future use.
} flags; ///< Flags describing the allocation.
GpuMemoryAllocationMethod allocMethod; ///< How the memory was allocated (see @ref GpuMemoryAllocationMethod).
const IGpuMemory* pGpuMemory; ///< Handle to the Pal::IGpuMemory object of this GPU memory allocation.
gpusize offset; ///< Offset, in bytes, of a suballocation within a base allocation. For
/// base allocations, offset is always zero.
};
#if PAL_DEVELOPER_BUILD
/// PWS acquire point for barrier logger. Only declared in developer builds.
///
/// Two spellings of the same list exist: client interface versions 901 and newer get an unscoped enum with
/// AcquirePoint-prefixed enumerators, older versions get a scoped enum with unprefixed enumerators. Both declare the
/// enumerators in the same order, so the underlying values match.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 901
enum AcquirePoint : uint8
{
AcquirePointPfp,
AcquirePointMe,
AcquirePointPreShader,
AcquirePointPreDepth,
AcquirePointPrePs,
AcquirePointPreColor,
AcquirePointEop,
AcquirePointCount
};
#else
enum class AcquirePoint : uint8
{
Pfp = 0,
Me,
PreShader,
PreDepth,
PrePs,
PreColor,
Eop, // Invalid, for internal optimization purpose.
Count
};
#endif
#endif
/// Information pertaining to the cache flush/invalidations and stalls performed during barrier execution.
///
/// Each of the three unions overlays a set of single-bit flags with a 16-bit integer (u16All) holding all of them.
struct BarrierOperations
{
union
{
struct
{
uint16 eopTsBottomOfPipe : 1; ///< Issue an end-of-pipe event that can be waited on.
/// When combined with waitOnTs, makes a full pipeline stall.
uint16 vsPartialFlush : 1; ///< Stall at ME, waiting for all prior VS waves to complete.
uint16 psPartialFlush : 1; ///< Stall at ME, waiting for all prior PS waves to complete.
uint16 csPartialFlush : 1; ///< Stall at ME, waiting for all prior CS waves to complete.
uint16 pfpSyncMe : 1; ///< Stall PFP until ME is at same point in command stream.
uint16 syncCpDma : 1; ///< Issue dummy cpDma command to confirm all prior cpDmas have
/// completed.
uint16 eosTsPsDone : 1; ///< Issue an end-of-pixel-shader event that can be waited on.
uint16 eosTsCsDone : 1; ///< Issue an end-of-compute-shader event that can be waited on.
uint16 waitOnTs : 1; ///< Wait on a timestamp event (EOP or EOS) at the ME.
/// Which event is not necessarily specified here, though any
/// that are specified here would be waited on.
uint16 reserved : 7; ///< Reserved for future use.
};
uint16 u16All; ///< Unsigned integer containing all the values.
} pipelineStalls; ///< Information about pipeline stalls performed.
union
{
struct
{
uint16 depthStencilExpand : 1; ///< Decompression of depth/stencil image.
uint16 htileHiZRangeExpand : 1; ///< Expansion of HTile's HiZ range.
uint16 depthStencilResummarize : 1; ///< Resummarization of depth stencil.
uint16 dccDecompress : 1; ///< DCC decompress BLT for color images.
uint16 fmaskDecompress : 1; ///< Fmask decompression for shader readability.
uint16 fastClearEliminate : 1; ///< Expand latest specified clear color into pixel data for the fast
/// cleared color/depth resource.
uint16 fmaskColorExpand : 1; ///< Completely decompresses the specified color resource.
uint16 initMaskRam : 1; ///< Memsets uninitialized memory to prepare it for use as
/// CMask/FMask/DCC/HTile.
uint16 updateDccStateMetadata : 1; ///< DCC state metadata was updated.
uint16 reserved : 7; ///< Reserved for future use.
};
uint16 u16All; ///< Unsigned integer containing all the values.
} layoutTransitions; ///< Information about layout transitions performed.
union
{
struct
{
uint16 invalTcp : 1; ///< Invalidate vector caches.
uint16 invalSqI$ : 1; ///< Invalidate the SQ instruction caches.
uint16 invalSqK$ : 1; ///< Invalidate the SQ constant caches (scalar caches).
uint16 flushTcc : 1; ///< Flush L2 cache.
uint16 invalTcc : 1; ///< Invalidate L2 cache.
uint16 flushCb : 1; ///< Flush CB caches.
uint16 invalCb : 1; ///< Invalidate CB caches.
uint16 flushDb : 1; ///< Flush DB caches.
uint16 invalDb : 1; ///< Invalidate DB caches.
uint16 invalCbMetadata : 1; ///< Invalidate CB meta-data cache.
uint16 flushCbMetadata : 1; ///< Flush CB meta-data cache.
uint16 invalDbMetadata : 1; ///< Invalidate DB meta-data cache.
uint16 flushDbMetadata : 1; ///< Flush DB meta-data cache.
uint16 invalTccMetadata : 1; ///< Invalidate L2 meta-data cache (also called the GLM).
uint16 invalGl1 : 1; ///< Invalidate the global L1 cache.
uint16 placeholder : 1; ///< Reserved for future use.
};
uint16 u16All; ///< Unsigned integer containing all the values.
} caches; ///< Information about cache operations performed for the barrier.
#if PAL_DEVELOPER_BUILD
AcquirePoint acquirePoint; ///< PWS acquire point (see AcquirePoint). Only present when PAL_DEVELOPER_BUILD is set.
#endif
};
/// Enumeration for PAL barrier reasons.
///
/// PAL-defined reasons are numbered consecutively starting at BarrierReasonFirst. New reasons must be added directly
/// before BarrierReasonInternalLastDefined so that the existing values do not change.
enum BarrierReason : uint32
{
BarrierReasonInvalid = 0, ///< Invalid barrier reason
BarrierReasonFirst = 0x80000000, ///< The first valid barrier reason value.
/// The only value that can be smaller than this is the
/// invalid value.
BarrierReasonLast = 0xbfffffff, ///< The last valid barrier reason value.
/// The only value that can be larger than this is the
/// unknown value.
BarrierReasonPreComputeColorClear = BarrierReasonFirst, ///< Barrier issued before a color clear
BarrierReasonPostComputeColorClear, ///< Barrier issued after a color clear
BarrierReasonPreComputeDepthStencilClear, ///< Barrier issued before a depth/stencil clear
BarrierReasonPostComputeDepthStencilClear, ///< Barrier issued after a depth/stencil clear
BarrierReasonMlaaResolveEdgeSync, ///< Barrier issued to sync mlaa edge calculations
BarrierReasonAqlWaitForParentKernel, ///< Barrier issued to wait for the parent kernel to
/// complete in an AQL submission
BarrierReasonAqlWaitForChildrenKernels, ///< Barrier issued to wait for the children kernels to
/// complete in an AQL submission
BarrierReasonP2PBlitSync, ///< Barrier issued to synchronize peer-to-peer blits
BarrierReasonTimeGraphGrid, ///< Barrier issued to wait for the time graph grid
BarrierReasonTimeGraphGpuLine, ///< Barrier issued to wait for the time graph gpu line
BarrierReasonDebugOverlayText, ///< Barrier issued to wait for the debug overlay text
BarrierReasonDebugOverlayGraph, ///< Barrier issued to wait for the debug overlay graph
BarrierReasonDevDriverOverlay, ///< Barrier issued to wait for developer driver overlay
BarrierReasonDmaImgScanlineCopySync, ///< Barrier issued to synchronize between image scanline
/// copies on the dma hardware
BarrierReasonPostSqttTrace, ///< Barrier issued to wait for work from an sqtt trace
BarrierReasonPrePerfDataCopy, ///< Barrier issued to wait for perf data to become
/// available for copy
BarrierReasonFlushL2CachedData, ///< Barrier issued to flush L2 cached data to main memory
BarrierReasonResolveImage, ///< Barrier issued before and after resolve image shader
BarrierReasonPerPixelCopy, ///< Barrier issued between CS copy and per-pixel copy steps
BarrierReasonGenerateMipmaps, ///< Barrier issued between generating mip levels
/// Newly defined barrier reasons should be before this one.
BarrierReasonInternalLastDefined, ///< Only used for asserts.
BarrierReasonUnknown = 0xFFFFFFFF, ///< Unknown barrier reason
/// Aliases kept for backwards compatibility.
BarrierReasonPreSyncClear = BarrierReasonPreComputeColorClear,
BarrierReasonPostSyncClear = BarrierReasonPostComputeColorClear
};
/// Style of barrier.
///
/// @see BarrierData::type
enum class BarrierType : uint32
{
Full = 0, ///< A traditional blocking barrier.
Release, ///< A pipelined barrier that flushes caches and starts transitions.
Acquire, ///< A barrier that waits on previous 'Release' barriers.
Count ///< The number of barrier types.
};
/// Information for barrier executions.
///
/// The type of the transition member depends on the client interface version: ImgBarrier for versions 902 and newer,
/// BarrierTransition for older versions.
struct BarrierData
{
ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the barrier.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 902
ImgBarrier transition; ///< The particular image barrier with layout transition blt that is currently
/// executing, only used during a CallbackType::ImageBarrier.
#else
BarrierTransition transition; ///< The particular transition with layout transition blt that is currently
/// executing, only used during a CallbackType::ImageBarrier.
#endif
bool hasTransition; ///< Whether or not the transition member above is populated.
BarrierOperations operations; ///< Detailed cache and pipeline operations performed during this barrier execution.
uint32 reason; ///< Reason that the barrier was invoked. Only filled at BarrierBegin.
BarrierType type; ///< What style of barrier this is. Only filled at BarrierBegin.
};
// The Gfx6 tiling declarations below are only available to client interface versions older than 888.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888
/// Enumeration describing the different types of tile mode dimensions.
enum class Gfx6ImageTileModeDimension : uint32
{
Linear = 0, ///< Linear tile mode.
Dim1d, ///< 1D tile mode.
Dim2d, ///< 2D tile mode.
Dim3d, ///< 3D tile mode.
};
/// Tile mode information: a dimensionality plus a set of property flags.
struct Gfx6ImageTileMode
{
Gfx6ImageTileModeDimension dimension; ///< Dimensionality of tile mode.
union
{
struct
{
uint32 prt : 1; ///< Image is a PRT.
uint32 thin : 1; ///< Thin tiled.
uint32 thick : 1; ///< Thick tiled.
uint32 reserved : 29; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} properties; ///< Bitfield of properties.
};
/// Enumeration describing the different tile types.
enum class Gfx6ImageTileType : uint32
{
Displayable = 0, ///< Displayable tiling.
NonDisplayable, ///< Non-displayable tiling.
DepthSampleOrder, ///< Same as non-displayable plus depth-sample-order.
Rotated, ///< Rotated displayable tiling.
Thick, ///< Thick micro-tiling.
};
#endif
/// Meta-data-related properties of an image surface, stored as single-bit flags that overlay one 32-bit integer.
struct ImageMetaDataInfo
{
union
{
struct
{
uint32 color : 1; ///< Flag indicates this is a color buffer.
uint32 depth : 1; ///< Flag indicates this is a depth/stencil buffer.
uint32 stencil : 1; ///< Flag indicates this is a stencil buffer.
uint32 texture : 1; ///< Flag indicates this is a texture.
uint32 cube : 1; ///< Flag indicates this is a cubemap.
uint32 volume : 1; ///< Flag indicates this is a volume texture.
uint32 fmask : 1; ///< Flag indicates this is an fmask.
uint32 compressZ : 1; ///< Flag indicates z buffer is compressed.
uint32 overlay : 1; ///< Flag indicates this is an overlay surface.
uint32 noStencil : 1; ///< Flag indicates this depth has no separate stencil.
uint32 display : 1; ///< Flag indicates this should match display controller requirements.
uint32 opt4Space : 1; ///< Flag indicates this surface should be optimized for space
/// i.e. save some memory but may lose performance.
uint32 prt : 1; ///< Flag for partially resident texture.
uint32 tcCompatible : 1; ///< Image's metadata is TC-compatible. This reduces the maximum
/// compression levels, but allows the shader to read the data without
/// an expensive decompress operation.
uint32 dccCompatible : 1; ///< GFX 8: whether to make MSAA surface support dcc fast clear.
uint32 dccPipeWorkaround : 1; ///< GFX 8: whether to work around the HW limit that
/// dcc can't be enabled if pipe config of tile mode
/// is different from that of ASIC.
uint32 disableLinearOpt : 1; ///< Disable tile mode optimization to linear.
uint32 reserved : 15; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} properties; ///< Bitfield of properties.
};
/// Information for allocation of a PAL Image - AddrLib surface info.
///
/// The tiling union is only part of the structure for client interface versions older than 888.
struct ImageDataAddrMgrSurfInfo
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888
union
{
struct
{
Gfx6ImageTileMode mode; ///< Tile mode.
Gfx6ImageTileType type; ///< Micro tiling type.
} gfx6; ///< Gfx6 tiling description.
struct
{
uint32 swizzle; ///< Swizzle mode.
} gfx9; ///< Gfx9 tiling description.
} tiling; ///< Tiling information, per hardware generation.
#endif
ImageMetaDataInfo flags; ///< Metadata info.
uint32 swizzle; ///< HW-specific swizzle mode.
uint64 size; ///< Surface size, in bytes.
uint32 bpp; ///< Bits per pixel.
uint32 width; ///< Width.
uint32 height; ///< Height.
uint32 depth; ///< Depth.
};
/// Type of surface for which the register data is being provided.
///
/// @see SurfRegDataInfo::type
enum class SurfRegDataType : uint32
{
RenderTargetView, ///< Render Target View information.
};
/// Information for surface addresses for a SurfRegData callback (see CallbackType::SurfRegData).
struct SurfRegDataInfo
{
SurfRegDataType type; ///< Type of surface to which the register data corresponds.
uint32 regData; ///< Hardware-specific register data for the specific surface type.
};
/// Type of draw or dispatch operation for a DrawDispatch callback.
///
/// Draw-style commands are listed first and compute dispatches last; FirstDispatch marks the boundary between the two
/// groups.
enum class DrawDispatchType : uint32
{
CmdDraw = 0, ///< Auto-indexed draw.
CmdDrawOpaque, ///< Auto draw.
CmdDrawIndexed, ///< Indexed draw.
CmdDrawIndirectMulti, ///< (Multi) indirect draw.
CmdDrawIndexedIndirectMulti, ///< (Multi) indirect indexed draw.
CmdDispatchMesh, ///< Task/Mesh shader dispatch.
CmdDispatchMeshIndirectMulti, ///< Indirect Task/Mesh shader dispatch.
CmdGenExecuteIndirectDraw, ///< ExecuteIndirect draw.
CmdGenExecuteIndirectDrawIndexed, ///< ExecuteIndirect indexed draw.
CmdGenExecuteIndirectDispatchMesh, ///< ExecuteIndirect Task/Mesh shader dispatch.
CmdDispatch, ///< Direct compute dispatch.
CmdDispatchAce, ///< Direct Compute dispatch through implicit ganged-submit ACE stream.
CmdDispatchIndirect, ///< Indirect compute dispatch.
CmdDispatchOffset, ///< Direct compute dispatch (offsetted start).
CmdGenExecuteIndirectDispatch, ///< ExecuteIndirect dispatch.
CmdDispatchAql, ///< AQL compute dispatch.
Count, ///< The number of draw/dispatch command types.
FirstDispatch = CmdDispatch ///< All callbacks with an enum value greater than or equal to this are dispatches.
};
/// Draw-specific information for DrawDispatch callbacks.
struct DrawDispatchDrawArgs
{
/// Contains information about user data register indices for certain draw parameter state.
/// Some of these values may not be available for all draws on all clients, and in such
/// cases the value will be UINT_MAX.
struct
{
uint32 firstVertex; ///< Vertex offset (first vertex) user data register index.
uint32 instanceOffset; ///< Instance offset (start instance) user data register index.
uint32 drawIndex; ///< Draw ID SPI user data register index.
} userDataRegs; ///< User data register indices for the draw parameters.
};
/// Dispatch-specific information for DrawDispatch callbacks.
///
/// Not every member is valid for every dispatch type; see the per-member notes.
struct DrawDispatchDispatchArgs
{
DispatchDims groupStart; ///< Thread/workgroup start offsets in X/Y/Z dimensions. Only valid for CmdDispatchOffset.
DispatchDims groupDims; ///< Thread/workgroup counts in X/Y/Z dimensions. Only valid for CmdDispatch[Offset].
DispatchDims logicalSize; ///< Thread/workgroup counts as seen by the shader. Only valid for CmdDispatchOffset.
/// Optional flags to help the client driver understand the dispatch.
/// For example, if the dispatch originated in PAL rather than the client driver.
DispatchInfoFlags infoFlags;
};
/// Information for DrawDispatch callbacks.
///
/// The draw and dispatch members share storage in an anonymous union; cmdType tells which of the two is valid.
struct DrawDispatchData
{
ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command.
DrawDispatchType cmdType; ///< Draw/dispatch command type. This influences which sub-structure below is valid.
union
{
/// Draw-specific parameters. Valid when cmdType is CmdDraw*.
DrawDispatchDrawArgs draw;
/// Dispatch-specific parameters. Valid when cmdType is CmdDispatch*.
DrawDispatchDispatchArgs dispatch;
};
/// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(),
/// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert
/// the marker.
RgpMarkerSubQueueFlags subQueueFlags;
};
/// Information for BindPipeline callbacks (see CallbackType::BindPipeline).
struct BindPipelineData
{
const IPipeline* pPipeline; ///< The currently-bound pipeline.
const PipelineInfo* pPipelineInfo; ///< General information about the bound pipeline.
ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command.
uint64 apiPsoHash; ///< The hash to correlate APIs and corresponding PSOs.
PipelineBindPoint bindPoint; ///< The bind point of the pipeline within a queue.
/// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(),
/// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert
/// the marker.
RgpMarkerSubQueueFlags subQueueFlags;
};
#if PAL_DEVELOPER_BUILD
/// Information for DrawDispatchValidation callbacks. Only declared in developer builds.
struct DrawDispatchValidationData
{
ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch.
uint32 userDataCmdSize; ///< Size of PM4 commands used to validate the current user-data entries (bytes).
uint32 miscCmdSize; ///< Size of PM4 commands for all other draw- or dispatch-time validation (bytes).
};
/// Information for BindPipelineValidation callbacks. Only declared in developer builds.
struct BindPipelineValidationData
{
ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering pipeline bind.
uint32 pipelineCmdSize; ///< Size of PM4 commands used to validate the current pipeline state (bytes).
};
/// Information for OptimizedRegisters callbacks. Only declared in developer builds.
///
/// For both the SH and the context register space, the structure carries a "seen" and a "kept" counter array
/// together with the register count and base address of that space.
struct OptimizedRegistersData
{
ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering PM4 stream.
/// Array containing the number of times the PM4 optimizer saw a SET packet which modified each register
const uint32* pShRegSeenSets;
/// Array containing the number of times the PM4 optimizer kept a SET packet which modified each register
const uint32* pShRegKeptSets;
uint32 shRegCount; ///< Number of SH registers
uint16 shRegBase; ///< Base address of SH registers
/// Array containing the number of times the PM4 optimizer saw a SET or RMW packet which modified each register
const uint32* pCtxRegSeenSets;
/// Array containing the number of times the PM4 optimizer kept a SET or RMW packet which modified each register
const uint32* pCtxRegKeptSets;
uint32 ctxRegCount; ///< Number of context registers
uint16 ctxRegBase; ///< Base address of context registers
};
/// Internal RPM blt type. Only declared in developer builds.
///
/// @see RpmBltData::bltType
enum class RpmBltType : uint32
{
CpDmaCopy = 0,
CpDmaUpdate,
Draw,
Dispatch,
Count ///< The number of RPM blt types.
};
/// Describes the RPM blt call reported by an RpmBlt callback. Only declared in developer builds.
struct RpmBltData
{
ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the blt.
RpmBltType bltType; ///< Type of RPM blt, @ref RpmBltType.
};
#endif
/// Describes the binding of a GPU Memory object to a resource (see CallbackType::BindGpuMemory).
struct BindGpuMemoryData
{
const void* pObj; ///< Opaque pointer to the resource having memory bound to it.
gpusize requiredGpuMemSize; ///< GPU memory size required by pObj.
const IGpuMemory* pGpuMemory; ///< IGpuMemory object being bound to the resource.
gpusize offset; ///< Offset within pGpuMemory where the resource is being bound.
bool isSystemMemory; ///< If true then system memory is being bound to the object. In this case,
/// pGpuMemory and offset should be set to zero.
};
/// Describes a user marker operation.
///
/// @see UserMarkerOpInfo::opType
enum class UserMarkerOpType : uint8
{
Invalid = 0, ///< Invalid user marker operation
Push, ///< Push user marker operation
Pop, ///< Pop user marker operation
Set ///< Set user marker operation
};
/// Describes a user marker operation, used in UserMarkerHistoryTraceSource.
///
/// The operation type and the string index are packed together into a single 32-bit value.
struct UserMarkerOpInfo
{
union
{
struct
{
uint32 opType : 2; ///< UserMarkerOpType
uint32 strIndex : 30; ///< Index of the user marker in the corresponding string table
};
uint32 u32All; ///< Both fields packed as 32-bit uint.
};
};
} // Developer
} // Pal
Datei-Diff unterdrückt, da er zu groß ist Diff laden
@@ -0,0 +1,171 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palFence.h
* @brief Defines the Platform Abstraction Library (PAL) IFence interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
namespace Pal
{
/// Specifies properties for @ref IFence fence creation. Input structure to IDevice::CreateFence().
struct FenceCreateInfo
{
union
{
struct
{
uint32 signaled : 1; ///< Specify whether the initial status of the fence is signaled or not.
uint32 eventCanBeInherited : 1; ///< The event handle can be inherited by child process.
uint32 shareable : 1; ///< This fence may be opened for use by a different device.
uint32 reserved : 29; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Fence creation flags.
#if defined(_WIN32)
const wchar_t* pName; ///< The name of the event object. Windows uses this name to uniquely identify fence objects
///< across processes.
#endif
};
/// Specifies properties for fence opening. Input structure to IDevice::OpenFence().
struct FenceOpenInfo
{
union
{
struct
{
uint32 isReference : 1; ///< If set, then the opened fence will reference the same sync object
///< in the kernel. Otherwise, the object is copied to the new Fence.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Fence open flags.
OsExternalHandle externalFence; ///< External shared fence handle.
#if defined(_WIN32)
const wchar_t* pName; ///< The name of the event object. Windows uses this name to uniquely identify
///< fence objects across processes.
#endif
};
/// Specifies properties for fence exporting. Input structure to IFence::ExportExternalHandle().
struct FenceExportInfo
{
union
{
struct
{
uint32 isReference : 1; ///< If set, then the exported handle references the same sync
///< object in the kernel. Otherwise, the object is copied to the new Fence.
uint32 implicitReset : 1; ///< If set, a fence reset will be done for the sync fd exported.
uint32 reserved : 30; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Fence export flags.
};
/**
 ***********************************************************************************************************************
 * @interface IFence
 * @brief     A command buffer fence that gives the client coarse-level synchronization between the GPU and the CPU.
 *
 * A fence can be supplied to IQueue::Submit(); it is signaled once certain prior queue operations have completed.
 * The client queries the fence's status to find out when the GPU work of interest has finished.
 *
 * Fences are guaranteed to wait for:
 * + Prior command buffer submissions.
 * + Prior queue semaphore signals and waits.
 * + Prior direct presents.
 *
 * @see IDevice::CreateFence()
 ***********************************************************************************************************************
 */
class IFence : public IDestroyable
{
public:
    /// Reports whether the fence has completed.
    ///
    /// @returns Success if the fence has been reached, or NotReady if it has not been reached yet. Any other return
    ///          code indicates an error:
    ///          + ErrorFenceNeverSubmitted if the fence has not been submitted yet and was not created in the
    ///            signaled state (see FenceCreateInfo::flags.signaled).
    virtual Result GetStatus() const = 0;

    /// Exports the fence's event handle or sync object handle for external usage.
    /// When @ref FenceExportInfo::isReference is not set, this also performs an implicit reset operation on the
    /// Fence.
    ///
    /// @param [in] exportInfo Information describing how the Fence handle should be exported.
    ///
    /// @returns The exported handle, as an OsExternalHandle.
    virtual OsExternalHandle ExportExternalHandle(const FenceExportInfo& exportInfo) const = 0;

    /// Retrieves the arbitrary client data pointer associated with this object.
    /// Lets the client attach its own data to a particular PAL object.
    ///
    /// @returns Pointer to client data.
    void* GetClientData() const { return m_pClientData; }

    /// Stores an arbitrary client data pointer with this object.
    /// Lets the client attach its own data to a particular PAL object.
    ///
    /// @param [in] pClientData A pointer to arbitrary client data.
    void SetClientData(void* pClientData) { m_pClientData = pClientData; }

protected:
    /// @internal Constructor. Protected so that the new operator cannot be used on this interface. Clients create
    /// objects by explicitly calling the proper create method.
    IFence() = default;

    /// @internal Destructor. Protected so that the delete operator cannot be used on this interface. Clients destroy
    /// objects by explicitly calling IDestroyable::Destroy() and free the system memory allocated for the object on
    /// their own.
    virtual ~IFence() = default;

private:
    /// @internal Client data pointer, initially null. It can hold an arbitrary value, which is read back with
    /// GetClientData() and changed with SetClientData().
    /// For non-top-layer objects, this will point to the layer above the current object.
    void* m_pClientData = nullptr;
};
} // Pal
@@ -0,0 +1,506 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palFormat.h
* @brief Common include for the Platform Abstraction Library (PAL) interface. Defines format types.
***********************************************************************************************************************
*/
#pragma once
#include "palUtil.h"
/// Library-wide namespace encapsulating all PAL entities.
namespace Pal
{
/// Specifies the format for an image or GPU memory view.
///
/// This defines the bit layout of the channels and how the value in each channel is interpreted.
///
/// Channels are listed in DX10+-style order, where the least significant channels are listed first. For example, for
/// a uint32 val with an X8Y8Z8W8 value: X = val & 0xFF, Y = (val >> 8) & 0xFF, Z = (val >> 16) & 0xFF,
/// W = (val >> 24) & 0xFF. Enums without a more detailed explanation can be decoded in this manner. Multimedia,
/// or "YUV" formats are all exceptions to this rule. Each of those formats explicitly describes how their channels are
/// organized.
///
/// Many of the multimedia (i.e., "YUV") formats are encoded such that the chrominance (chroma, CbCr, UV) samples are
/// stored at a lower resolution than the luminance (luma, Y) samples as a form of compression. The ratio of the
/// subsampling is often referred to using an A:B:C notation, where the trio of numbers A,B,C are used to describe the
/// number of luma and chroma samples in a hypothetical region which is A pixels wide and 2 pixels high. The three
/// numbers usually refer to the following quantities by convention:
/// A --> Width of the conceptual region of pixels, and is usually 4.
/// B --> Number of chroma samples in the first of two rows of A pixels.
/// C --> Number of changes of chroma samples between the first and second rows of A pixels.
///
/// Common examples of this notation are described below:
/// 4:4:4 --> No chroma subsampling because luma and chroma both have 4 samples per row, and the number of chroma and
/// luma rows is the same.
/// 4:1:1 --> Each row of 4 pixels has 1 chroma sample, and the number of chroma and luma rows is the same.
/// 4:2:0 --> Each row of 4 pixels has 2 chroma samples, and there is only 1 chroma row for every 2 luma rows.
/// 4:2:2 --> Each row of 4 pixels has 2 chroma samples, and the number of chroma and luma rows is the same.
///
/// Because of the subsampling ratios for multimedia formats, there are some restrictions on what dimensions can be used
/// when creating Images of these formats. 4:1:1 formats must have widths specified as a multiple of 4. 4:2:0 formats
/// must have widths and heights specified as multiples of 2. 4:2:2 formats must have widths specified as a multiple of
/// 2. 4:4:4 formats have no dimensional restrictions.
///
/// Additionally, the YUV formats are broadly grouped into two categories: packed and planar formats. Packed formats
/// interleave the luma and chroma samples in each row of pixels. Planar formats are organized so that all of the luma
/// samples are together, followed by all of the chroma samples. Some planar formats interleave the U and V chroma
/// data, while some choose to have separate U and V planes. Both packed and planar formats can have any subsampling
/// ratio between the luma and chroma data.
enum class ChNumFormat : Util::uint32
{
Undefined = 0x0, ///< Used in situations where no format is needed, like raw memory views, or to
/// indicate no color/depth target will be attached when creating a graphics
/// pipeline.
X1_Unorm = 0x1, ///< _Untested._
X1_Uscaled = 0x2, ///< _Untested._
X4Y4_Unorm = 0x3,
X4Y4_Uscaled = 0x4,
L4A4_Unorm = 0x5,
X4Y4Z4W4_Unorm = 0x6,
X4Y4Z4W4_Uscaled = 0x7,
X5Y6Z5_Unorm = 0x8,
X5Y6Z5_Uscaled = 0x9,
X5Y5Z5W1_Unorm = 0xA,
X5Y5Z5W1_Uscaled = 0xB,
X1Y5Z5W5_Unorm = 0xC,
X1Y5Z5W5_Uscaled = 0xD,
X8_Unorm = 0xE,
X8_Snorm = 0xF,
X8_Uscaled = 0x10,
X8_Sscaled = 0x11,
X8_Uint = 0x12,
X8_Sint = 0x13,
X8_Srgb = 0x14,
A8_Unorm = 0x15,
L8_Unorm = 0x16,
P8_Unorm = 0x17,
X8Y8_Unorm = 0x18,
X8Y8_Snorm = 0x19,
X8Y8_Uscaled = 0x1A,
X8Y8_Sscaled = 0x1B,
X8Y8_Uint = 0x1C,
X8Y8_Sint = 0x1D,
X8Y8_Srgb = 0x1E,
L8A8_Unorm = 0x1F,
X8Y8Z8W8_Unorm = 0x20,
X8Y8Z8W8_Snorm = 0x21,
X8Y8Z8W8_Uscaled = 0x22,
X8Y8Z8W8_Sscaled = 0x23,
X8Y8Z8W8_Uint = 0x24,
X8Y8Z8W8_Sint = 0x25,
X8Y8Z8W8_Srgb = 0x26,
U8V8_Snorm_L8W8_Unorm = 0x27, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats
/// are X8Y8Z8W8_Snorm (to target U8V8_Snorm) and X8Y8Z8W8_Unorm (to target
/// L8W8_Unorm).
X10Y11Z11_Float = 0x28,
X11Y11Z10_Float = 0x29,
X10Y10Z10W2_Unorm = 0x2A,
X10Y10Z10W2_Snorm = 0x2B,
X10Y10Z10W2_Uscaled = 0x2C,
X10Y10Z10W2_Sscaled = 0x2D,
X10Y10Z10W2_Uint = 0x2E,
X10Y10Z10W2_Sint = 0x2F,
X10Y10Z10W2Bias_Unorm = 0x30, ///< A four-component, 32-bit 2.8-biased fixed-point format that supports 10
/// bits for each color channel and 2-bit alpha. A shader must be aware of
/// *Bias* and must perform its own bias and scale on any data that is read
/// from or written.
U10V10W10_Snorm_A2_Unorm = 0X31, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats
/// are X10Y10Z10W2_Snorm (to target U10V10W10_Snorm) and X10Y10Z10W2_Unorm
/// (to target A2_Unorm).
X16_Unorm = 0x32,
X16_Snorm = 0x33,
X16_Uscaled = 0x34,
X16_Sscaled = 0x35,
X16_Uint = 0x36,
X16_Sint = 0x37,
X16_Float = 0x38,
L16_Unorm = 0x39,
X16Y16_Unorm = 0x3A,
X16Y16_Snorm = 0x3B,
X16Y16_Uscaled = 0x3C,
X16Y16_Sscaled = 0x3D,
X16Y16_Uint = 0x3E,
X16Y16_Sint = 0x3F,
X16Y16_Float = 0x40,
X16Y16Z16W16_Unorm = 0x41,
X16Y16Z16W16_Snorm = 0x42,
X16Y16Z16W16_Uscaled = 0x43,
X16Y16Z16W16_Sscaled = 0x44,
X16Y16Z16W16_Uint = 0x45,
X16Y16Z16W16_Sint = 0x46,
X16Y16Z16W16_Float = 0x47,
X32_Uint = 0x48,
X32_Sint = 0x49,
X32_Float = 0x4A,
X32Y32_Uint = 0x4B,
X32Y32_Sint = 0x4C,
X32Y32_Float = 0x4D,
X32Y32Z32_Uint = 0x4E,
X32Y32Z32_Sint = 0x4F,
X32Y32Z32_Float = 0x50,
X32Y32Z32W32_Uint = 0x51,
X32Y32Z32W32_Sint = 0x52,
X32Y32Z32W32_Float = 0x53,
D16_Unorm_S8_Uint = 0x54,
D32_Float_S8_Uint = 0x55,
X9Y9Z9E5_Float = 0x56, ///< Shared exponent format. Three partial-precision floating-point numbers
/// encoded into a single 32-bit value all sharing the same 5-bit exponent
/// (variant of s10e5, which is sign bit, 10-bit mantissa, and 5-bit biased
/// (15) exponent). There is no sign bit, and there is a shared 5-bit biased
/// (15) exponent and a 9-bit mantissa for each channel.
Bc1_Unorm = 0x57, ///< BC1 compressed texture format.
Bc1_Srgb = 0x58, ///< BC1 compressed texture format.
Bc2_Unorm = 0x59, ///< BC2 compressed texture format.
Bc2_Srgb = 0x5A, ///< BC2 compressed texture format.
Bc3_Unorm = 0x5B, ///< BC3 compressed texture format.
Bc3_Srgb = 0x5C, ///< BC3 compressed texture format.
Bc4_Unorm = 0x5D, ///< BC4 compressed texture format.
Bc4_Snorm = 0x5E, ///< BC4 compressed texture format.
Bc5_Unorm = 0x5F, ///< BC5 compressed texture format.
Bc5_Snorm = 0x60, ///< BC5 compressed texture format.
Bc6_Ufloat = 0x61, ///< BC6 unsigned compressed texture format.
Bc6_Sfloat = 0x62, ///< BC6 signed compressed texture format.
Bc7_Unorm = 0x63, ///< BC7 compressed texture format.
Bc7_Srgb = 0x64, ///< BC7 compressed texture format.
Etc2X8Y8Z8_Unorm = 0x65,
Etc2X8Y8Z8_Srgb = 0x66,
Etc2X8Y8Z8W1_Unorm = 0x67,
Etc2X8Y8Z8W1_Srgb = 0x68,
Etc2X8Y8Z8W8_Unorm = 0x69,
Etc2X8Y8Z8W8_Srgb = 0x6A,
Etc2X11_Unorm = 0x6B,
Etc2X11_Snorm = 0x6C,
Etc2X11Y11_Unorm = 0x6D,
Etc2X11Y11_Snorm = 0x6E,
AstcLdr4x4_Unorm = 0x6F,
AstcLdr4x4_Srgb = 0x70,
AstcLdr5x4_Unorm = 0x71,
AstcLdr5x4_Srgb = 0x72,
AstcLdr5x5_Unorm = 0x73,
AstcLdr5x5_Srgb = 0x74,
AstcLdr6x5_Unorm = 0x75,
AstcLdr6x5_Srgb = 0x76,
AstcLdr6x6_Unorm = 0x77,
AstcLdr6x6_Srgb = 0x78,
AstcLdr8x5_Unorm = 0x79,
AstcLdr8x5_Srgb = 0x7A,
AstcLdr8x6_Unorm = 0x7B,
AstcLdr8x6_Srgb = 0x7C,
AstcLdr8x8_Unorm = 0x7D,
AstcLdr8x8_Srgb = 0x7E,
AstcLdr10x5_Unorm = 0x7F,
AstcLdr10x5_Srgb = 0x80,
AstcLdr10x6_Unorm = 0x81,
AstcLdr10x6_Srgb = 0x82,
AstcLdr10x8_Unorm = 0x83,
AstcLdr10x8_Srgb = 0x84,
AstcLdr10x10_Unorm = 0x85,
AstcLdr10x10_Srgb = 0x86,
AstcLdr12x10_Unorm = 0x87,
AstcLdr12x10_Srgb = 0x88,
AstcLdr12x12_Unorm = 0x89,
AstcLdr12x12_Srgb = 0x8A,
AstcHdr4x4_Float = 0x8B,
AstcHdr5x4_Float = 0x8C,
AstcHdr5x5_Float = 0x8D,
AstcHdr6x5_Float = 0x8E,
AstcHdr6x6_Float = 0x8F,
AstcHdr8x5_Float = 0x90,
AstcHdr8x6_Float = 0x91,
AstcHdr8x8_Float = 0x92,
AstcHdr10x5_Float = 0x93,
AstcHdr10x6_Float = 0x94,
AstcHdr10x8_Float = 0x95,
AstcHdr10x10_Float = 0x96,
AstcHdr12x10_Float = 0x97,
AstcHdr12x12_Float = 0x98,
X8Y8_Z8Y8_Unorm = 0x99, ///< _Untested._
X8Y8_Z8Y8_Uscaled = 0x9A, ///< _Untested._
Y8X8_Y8Z8_Unorm = 0x9B, ///< _Untested._
Y8X8_Y8Z8_Uscaled = 0x9C, ///< _Untested._
AYUV = 0x9D, ///< YUV 4:4:4 packed format. Valid Image and Color-Target view formats are
/// { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully maps the
/// entire YUV subresource, with the V,U,Y,A channels mapped to the X,Y,Z,W
/// channels respectively. Additionally, Image views can use the { X32, Uint }
/// format where all four channels are packed into a single uint32.
UYVY = 0x9E, ///< YUV 4:2:2 packed format. The Image data is subsampled such that each 32bit
/// element contains two Y samples and one U and V sample. Valid Image view
/// formats are { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully
/// maps the entire YUV subresource, with the X,Y,Z,W channels mapped to the
/// U0,Y0,V0,Y1 channels respectively. Additionally, Image views can use the
/// { X32, Uint } format where all four channels are packed into a single
/// uint32. Image views can also use the { X8Y8_Z8Y8, Unorm } format to access
/// these as well. In this case, the width of the Image view would appear to be
/// twice as wide as it normally does, and the X0,Y0,Z0,Y1 channels map to the
/// U0,Y0,V0,Y1 channels respectively.
VYUY = 0x9F, ///< YUV 4:2:2 packed format. The image data is encoded just like the
/// @ref ChNumFormat::UYVY format, except with a different channel ordering.
/// Image views with X8Y8Z8W8 channel formats map the X,Y,Z,W channels to the
/// V0,Y0,U0,Y1 channels respectively. Image views with the X8Y8_Z8Y8 channel
/// format map the X0,Y0,Z0,Y1 channels to the V0,Y0,U0,Y1 channels
/// respectively.
YUY2 = 0xA0, ///< YUV 4:2:2 packed format. The image data is encoded just like the
/// @ref ChNumFormat::UYVY format, except with a different channel ordering.
/// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,U0,Y1,V0
/// channels respectively. Image views can use the { Y8X8_Y8Z8, Unorm } format
/// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,U0,Y1,V0 channels.
YVY2 = 0xA1, ///< YUV 4:2:2 packed format. The image data is encoded just like the
/// @ref ChNumFormat::YUY2 format, except with a different channel ordering.
/// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,V0,Y1,U0
/// channels respectively. Image views can use the { Y8X8_Y8Z8, Unorm } format
/// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,V0,Y1,U0 channels.
YV12 = 0xA2, ///< YVU 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y
/// plane is first, containing a uint8 per sample. Next is the V plane and the U
/// plane, both of which have a uint8 per sample. Valid Image view formats are
/// { X8, Unorm } and { X8, Uint }. Each view only has access to one of the Y,
/// V, or U planes.
NV11 = 0xA3, ///< YUV 4:1:1 planar format, with 8 bits per luma and chroma sample. The Y
/// plane is first, containing a uint8 per sample. Next is a UV plane which
/// has interleaved U and V samples, each stored as a uint8. Valid Image and
/// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm }
/// and { X8Y8, Uint }. When using an X8 channel format for the View, the view
/// only has access to the Y plane. When using X8Y8, the view only has access
/// to the UV plane.
NV12 = 0xA4, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y
/// plane is first, containing a uint8 per sample. Next is a UV plane which
/// has interleaved U and V samples, each stored as a uint8. Valid Image and
/// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm }
/// and { X8Y8, Uint }. When using an X8 channel format for the View, the view
/// only has access to the Y plane. When using X8Y8, the view only has access
/// to the UV plane.
NV21 = 0xA5, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. This is
/// identical to @ref ChNumFormat::NV12, except that the second plane swaps the
/// ordering of the U and V samples. Image views behave just like with
/// @ref ChNumFormat::NV12.
P016 = 0xA6, ///< YUV 4:2:0 planar format, with 16 bits per luma and chroma sample. The
/// plane ordering is identical to @ref ChNumFormat::NV12. Instead of uint8
/// samples, this format uses 8.8 fixed point sample encoding. Image views
/// behave just like with @ref ChNumFormat::NV12, except X16 channel formats
/// are used for the Y plane, and X16Y16 channel formats are used for the UV
/// plane.
P010 = 0xA7, ///< YUV 4:2:0 planar format, with 10 bits per luma and chroma sample. This is
/// identical to @ref ChNumFormat::P016, except that the lowest 6 bits of each
/// luma and chroma sample are ignored. This allows the source data to be
/// interpreted as either P016 or P010 interchangeably.
P210 = 0xA8, ///< YUV 4:2:2 planar format, with 10 bits per luma and chroma sample. This is
/// similar to @ref ChNumFormat::P010, except that the UV planes are sub-sampled
/// only in the horizontal direction, but still by a factor of 2 so the UV plane
/// ends up having the same number of lines as the Y plane.
X8_MM_Unorm = 0xA9, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar
/// surfaces. Such as the Y plane or any plane in YV12.
X8_MM_Uint = 0xAA, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar
/// surfaces. Such as the Y plane or any plane in YV12.
X8Y8_MM_Unorm = 0xAB, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar
/// surfaces.
X8Y8_MM_Uint = 0xAC, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar
/// surfaces.
X16_MM10_Unorm = 0xAD, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar
/// surfaces (10-bit). Such as the Y plane or any plane in YV12.
X16_MM10_Uint = 0xAE, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar
/// surfaces (10-bit). Such as the Y plane or any plane in YV12.
X16Y16_MM10_Unorm = 0xAF, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar
/// surfaces (10-bit).
X16Y16_MM10_Uint = 0xB0, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar
/// surfaces (10-bit).
P208 = 0xB1, ///< YUV 4:2:2 planar format, with 8 bits per luma and chroma sample. This is
/// similar to @ref ChNumFormat::NV12, except that the UV planes are sub-sampled
/// only in the horizontal direction, but still by a factor of 2 so the UV plane
/// ends up having the same number of lines as the Y plane. This format is
/// sometimes referred to as NV16.
X16_MM12_Unorm = 0xB2, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar
/// surfaces (12-bit).
X16_MM12_Uint = 0xB3, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar
/// surfaces (12-bit).
X16Y16_MM12_Unorm = 0xB4, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar
/// surfaces (12-bit).
X16Y16_MM12_Uint = 0xB5, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar
/// surfaces (12-bit).
P012 = 0xB6, ///< YUV 4:2:0 planar format, with 12 bits per luma and chroma sample. This is
/// identical to @ref ChNumFormat::P010, except that the lowest 4 bits of each
/// luma and chroma sample are ignored.
P212 = 0xB7, ///< YUV 4:2:2 planar format, with 12 bits per luma and chroma sample. This is
/// identical to @ref ChNumFormat::P210, except that the lowest 4 bits of each
/// luma and chroma sample are ignored.
P412 = 0xB8, ///< YUV 4:4:4 planar format, with 12 bits per luma and chroma sample. It consists
/// of a Y-plane followed by an interleaved UV plane.
X10Y10Z10W2_Float = 0xB9, ///< RGBA format with three 10-bit floats (6e4) and a 2-bit unorm as alpha.
Y216 = 0xBA, ///< YUV 4:2:2 packed, with 16 bits per luma or chroma sample. No alpha.
Y210 = 0xBB, ///< YUV 4:2:2 packed, with 10 bits per luma or chroma sample. No alpha.
/// Same memory layout as @ref ChNumFormat::Y216.
/// The lowest 6 bits of each sample are ignored.
Y416 = 0xBC, ///< YUV 4:4:4 packed, with 16 bits per luma or chroma sample.
Y410 = 0xBD, ///< YUV 4:4:4 packed, with 10 bits per luma or chroma sample and 2 bits for alpha.
_ReservedBE = 0xBE, ///< Reserved slot (0xBE); presumably not a format clients should use - confirm.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923
P216 = 0xBF, ///< YUV 4:2:2 planar format, with 16 bits per luma and chroma sample. It consists
/// of a Y-plane followed by interleaved UV plane.
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924
YUV_420P10 = 0xC0, ///< YUV 4:2:0 tri-planar format, with 10 bits per luma and chroma sample.
YUV_422P10 = 0xC1, ///< YUV 4:2:2 tri-planar format, with 10 bits per luma and chroma sample.
YUV_444P10 = 0xC2, ///< YUV 4:4:4 tri-planar format, with 10 bits per luma and chroma sample.
YUV_420P12 = 0xC3, ///< YUV 4:2:0 tri-planar format, with 12 bits per luma and chroma sample.
YUV_422P12 = 0xC4, ///< YUV 4:2:2 tri-planar format, with 12 bits per luma and chroma sample.
YUV_444P12 = 0xC5, ///< YUV 4:4:4 tri-planar format, with 12 bits per luma and chroma sample.
YUV_420P16 = 0xC6, ///< YUV 4:2:0 tri-planar format, with 16 bits per luma and chroma sample.
YUV_422P16 = 0xC7, ///< YUV 4:2:2 tri-planar format, with 16 bits per luma and chroma sample.
YUV_444P16 = 0xC8, ///< YUV 4:4:4 tri-planar format, with 16 bits per luma and chroma sample.
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925
YV16 = 0xC9, ///< YVU 4:2:2 tri-planar format, with 8 bits per luma and chroma sample. This
/// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled in
/// vertical direction.
YV24 = 0xCA, ///< YVU 4:4:4 tri-planar format, with 8 bits per luma and chroma sample. This
/// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled.
NV24 = 0xCB, ///< YUV 4:4:4 bi-planar format, with 8 bits per luma and chroma sample. This
/// is similar to @ref ChNumFormat::NV12, except chroma is not subsampled.
/// This format is sometimes referred to as P408.
P410 = 0xCC, ///< YUV 4:4:4 planar format, with 10 bits per luma and chroma sample. It consists
/// of a Y-plane followed by interleaved UV plane.
P416 = 0xCD, ///< YUV 4:4:4 planar format, with 16 bits per luma and chroma sample. It consists
/// of a Y-plane followed by interleaved UV plane.
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 929
X16Y16Z16W16_MM10_Unorm = 0xCE, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of
/// each component are ignored.
X16Y16Z16W16_MM10_Uint = 0xCF, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of
/// each component are ignored.
X16Y16Z16W16_MM12_Unorm = 0xD0, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of
/// each component are ignored.
X16Y16Z16W16_MM12_Uint = 0xD1, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of
/// each component are ignored.
#endif
Count, ///< One past the last format enabled for the client interface version in use; used to
/// size per-format lookup tables.
};
/// Specifies which channel of a resource should be mapped to a particular component of an image view.
/// The underlying type is 8 bits wide, so four swizzles occupy the same storage as one 32-bit value
/// (see @ref ChannelMapping).
///
/// @ingroup ResourceBinding
enum class ChannelSwizzle : Util::uint8
{
Zero = 0x0, ///< Ignore resource data and always fetch a 0 into this component.
One = 0x1, ///< Ignore resource data and always fetch a 1 into this component.
X = 0x2, ///< Use the X channel from resource for this component.
Y = 0x3, ///< Use the Y channel from resource for this component.
Z = 0x4, ///< Use the Z channel from resource for this component.
W = 0x5, ///< Use the W channel from resource for this component.
Count ///< Number of swizzle values defined above.
};
/// Specifies a mapping for each component of an image or buffer view to a channel in its associated resource.
/// The same four swizzles can be accessed by name (r/g/b/a), by index (swizzle[]), or as one packed 32-bit value
/// (swizzleValue).
///
/// @ingroup ResourceBinding
struct ChannelMapping
{
union
{
struct
{
ChannelSwizzle r; ///< Red component swizzle.
ChannelSwizzle g; ///< Green component swizzle.
ChannelSwizzle b; ///< Blue component swizzle.
ChannelSwizzle a; ///< Alpha component swizzle.
};
ChannelSwizzle swizzle[4]; ///< All four swizzles packed into one array.
Util::uint32 swizzleValue; ///< All four swizzles viewed as a single 32-bit value; used to compare whole mappings.
};
};
/// Specifies a pixel format for an image or memory view and its corresponding channel swizzle.
/// Two SwizzledFormats compare equal (operator==) when both the format and the packed swizzle value match.
struct SwizzledFormat
{
ChNumFormat format; ///< Pixel format.
ChannelMapping swizzle; ///< Compatible channel swizzle for the above pixel format.
};
/// Equality comparison for @ref SwizzledFormat.
///
/// @param [in] lhs Left hand side of the comparison.
/// @param [in] rhs Right hand side of the comparison.
///
/// @returns True when both operands name the same pixel format and carry the same packed channel swizzle value.
inline constexpr bool operator==(const SwizzledFormat& lhs, const SwizzledFormat& rhs)
{
    // Differing pixel formats settle the answer; the packed swizzle is only inspected when the formats agree.
    return (lhs.format != rhs.format) ? false
                                      : (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue);
}
/// Constant for undefined formats: pairs ChNumFormat::Undefined with the swizzle (X, Zero, Zero, One).
constexpr SwizzledFormat UndefinedSwizzledFormat =
{
ChNumFormat::Undefined,
// Nested braces, outermost first: ChannelMapping, its anonymous union, then the r/g/b/a struct.
{ { { ChannelSwizzle::X, ChannelSwizzle::Zero, ChannelSwizzle::Zero, ChannelSwizzle::One } } },
};
/// Flags enumeration reporting available capabilities of a particular format. Each enumerator is a single bit so
/// that values can be combined into a mask.
///
/// NOTE(review): FormatFeatureFormatConversionSrc (0x10000) and FormatFeatureFormatConversionDst (0x20000) are
/// distinct bits in this enum, so the "aliased" wording in their notes is not expressed by the values themselves;
/// confirm where (or whether) that aliasing is applied.
enum FormatFeatureFlags : Util::uint32
{
FormatFeatureCopy = 0x00001, ///< Images of this format can be used as a copy source or destination.
FormatFeatureFormatConversion = 0x00002, ///< Images of this format support format conversion in copy
/// operations.
FormatFeatureImageShaderRead = 0x00004, ///< Images of this format can be read from a shader.
FormatFeatureImageShaderWrite = 0x00008, ///< Images of this format can be written from a shader.
FormatFeatureImageShaderAtomics = 0x00010, ///< Images of this format can be written atomically from a shader.
FormatFeatureMemoryShaderRead = 0x00020, ///< Memory views of this format can be read from a shader.
FormatFeatureMemoryShaderWrite = 0x00040, ///< Memory views of this format can be written from a shader.
FormatFeatureMemoryShaderAtomics = 0x00080, ///< Memory views of this format can be written atomically from a
/// shader.
FormatFeatureColorTargetWrite = 0x00100, ///< Images of this format can be bound as a color target.
FormatFeatureColorTargetBlend = 0x00200, ///< Images of this format can be bound as a color target for blending.
FormatFeatureDepthTarget = 0x00400, ///< Images of this format can be bound as a depth target.
FormatFeatureStencilTarget = 0x00800, ///< Images of this format can be bound as a stencil target.
FormatFeatureMsaaTarget = 0x01000, ///< Images of this format can support multisampling.
FormatFeatureWindowedPresent = 0x02000, ///< Images of this format can support windowed-mode presents.
/// Fullscreen present capability is queried using the @ref
/// IScreen::GetScreenModeList method.
FormatFeatureImageFilterLinear = 0x04000, ///< Images of this format can be linearly filtered.
FormatFeatureImageFilterMinMax = 0x08000, ///< Images of this format can be min/max filtered.
FormatFeatureFormatConversionSrc = 0x10000, ///< Images of this format support format conversion in copy
/// operations as the source image.
/// @note This is aliased to FormatFeatureFormatConversionDst for
/// backwards compatibility.
FormatFeatureFormatConversionDst = 0x20000, ///< Images of this format support format conversion in copy
/// operations as the destination image.
/// @note This is aliased to FormatFeatureFormatConversionSrc for
/// backwards compatibility.
};
/// Enumeration for indexing into the format properties table based on tiling. Used as the second dimension of
/// @ref MergedFormatPropertiesTable::features. This is an unscoped enum, so its enumerators are referenced
/// directly from the enclosing namespace or as FormatPropertiesTiling::<name>.
enum FormatPropertiesTiling : Util::uint32
{
IsLinear = 0, ///< Format properties requested is for linearly-tiled surfaces.
IsNonLinear, ///< Format properties requested is for non-linearly tiled surfaces.
Count, ///< Number of format property tile types.
};
/// The format properties lookup table. Contains information about which device access features are available for all
/// formats and tiling modes. The tiling features for non-linear tiling modes are identical so we only store linear
/// and non-linear tiling features. From left to right, it is indexed by format and "is-non-linear".
/// Returned by IDevice::GetFormatProperties().
struct MergedFormatPropertiesTable
{
/// Feature masks indexed as features[format][tiling]: the first index is a @ref ChNumFormat value converted to
/// size_t, the second is a @ref FormatPropertiesTiling value.
FormatFeatureFlags features[static_cast<size_t>(ChNumFormat::Count)][FormatPropertiesTiling::Count];
};
} // Pal
@@ -0,0 +1,881 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palFormatInfo.h
* @brief Defines the Platform Abstraction Library (PAL) Format utility functions.
***********************************************************************************************************************
*/
#pragma once
#include "palDevice.h"
#include "palImage.h"
#include "palInlineFuncs.h"
#include "palMath.h"
namespace Pal
{
/// Namespace encapsulating all PAL format utility functions.
namespace Formats
{
/// Specifies flags which indicate properties of each PAL channel format. Each enumerator is a single bit; a mask
/// of these values is stored in @ref FormatInfo::properties.
enum PropertyFlags : uint32
{
BitCountInaccurate = 0x1, ///< Indicates that format's bit count array is inaccurate
BlockCompressed = 0x2, ///< Indicates channel format is block-compressed
MacroPixelPacked = 0x4, ///< Indicates channel format has multiple pixels' data packed together into
/// one "macro pixel"
YuvPlanar = 0x8, ///< Indicates channel format is YUV-planar
YuvPacked = 0x10, ///< Indicates channel format is YUV packed
};
/// Specifies numeric support of a specified format. Despite the "Flags" name, the enumerators are sequential
/// values (not bits); each @ref FormatInfo stores exactly one of them in its numericSupport member.
enum class NumericSupportFlags : uint32
{
Undefined, ///< No numeric support.
Unorm, ///< Unsigned normalized.
Snorm, ///< Signed normalized.
Uscaled, ///< _Untested._ Treated as an unsigned integer inside the resource, but received by
/// the shader as a floating point number.
Sscaled, ///< _Untested._ Treated as a signed integer inside the resource, but received by
/// the shader as a floating point number.
Uint, ///< Unsigned integer.
Sint, ///< Signed integer.
Float, ///< Floating point number.
Srgb, ///< sRGB.
DepthStencil, ///< Depth/stencil support.
Yuv, ///< YUV support.
};
/// Specifies flags which indicate the presence of each color channel in a PAL channel format. Together the four
/// flags occupy the low four bits (0xF); a mask of these values is stored in @ref FormatInfo::channelMask.
enum ChannelFlags : uint32
{
X = 0x1, ///< Indicates the X channel is present.
Y = 0x2, ///< Indicates the Y channel is present.
Z = 0x4, ///< Indicates the Z channel is present.
W = 0x8, ///< Indicates the W channel is present.
};
/// An entry in the channel-format info lookup table. Contains intrinsic properties describing a channel format.
struct FormatInfo
{
uint32 bitsPerPixel; ///< Total count of bits in a single pixel (or block).
uint32 componentCount; ///< Number of color components (channels) present.
uint32 bitCount[4]; ///< Number of bits for each component in the format. These members are
/// only reliable if the @ref BitCountInaccurate flag is not set in
/// 'properties'. Listed in order: X, Y, Z, and W.
uint32 channelMask; ///< Mask of @ref ChannelFlags values indicating which channels are present.
uint32 properties; ///< Mask of @ref PropertyFlags values indicating which properties a format
/// has.
NumericSupportFlags numericSupport; ///< Which numeric format this format represents. Used for easy identification.
};
/// Side length of a BC compressed block (blocks are 4x4).
static constexpr uint32 CompressedBcBlockDim = 4;
/// Side length of an ETC compressed block (blocks are 4x4).
static constexpr uint32 CompressedEtcBlockDim = 4;
/// Lookup table for intrinsic properties describing each channel format. Callers should access the members of this
/// table via BitsPerPixel() and related functions. The table has one entry per @ref ChNumFormat value and is
/// indexed by the format converted to size_t. Only declared here; the definition lives elsewhere.
extern const FormatInfo FormatInfoTable[static_cast<size_t>(ChNumFormat::Count)];
/// Convert a floating-point representation of a color value in RGBA order to the appropriate bit representation for
/// each channel based on the specified format. Swizzling is enabled by default to maintain backwards compatibility.
/// There will be no swizzling functionality going forward.
///
/// @param [in] format Format (and swizzle) describing the target bit representation.
/// @param [in] pColorIn Input color in RGBA order (presumably four floats - confirm against the definition).
/// @param [out] pColorOut Receives the converted per-channel bit patterns (presumably four uint32s - confirm).
extern void ConvertColor(
SwizzledFormat format,
const float* pColorIn,
uint32* pColorOut);
/// Convert an unsigned integer representation of a color value in YUVA order to the appropriate bit representation for
/// each channel based on the specified format.
///
/// @param [in] format Format describing the target bit representation.
/// @param [in] plane Plane selector (presumably the index of the YUV plane being converted - confirm).
/// @param [in] pColorIn Input color in YUVA order.
/// @param [out] pColorOut Receives the converted per-channel bit patterns.
extern void ConvertYuvColor(
SwizzledFormat format,
uint32 plane,
const uint32* pColorIn,
uint32* pColorOut);
/// Packs a clear color value in RGBA order to a single element of the provided format and stores it in the
/// memory provided. Swizzling is enabled by default to maintain backwards compatibility. There will be
/// no swizzling functionality going forward.
///
/// @param [in] format Format (and swizzle) of the element to produce.
/// @param [in] pColor Clear color in RGBA order.
/// @param [out] pBufferMemory Destination for the packed element (presumably must hold at least one element of
/// 'format' - confirm against the definition).
extern void PackRawClearColor(
SwizzledFormat format,
const uint32* pColor,
void* pBufferMemory);
/// Swizzles the color according to the provided format swizzle.
///
/// @param [in] format Format whose channel swizzle is applied.
/// @param [in] pColorIn Unswizzled input color.
/// @param [out] pColorOut Receives the swizzled color.
extern void SwizzleColor(SwizzledFormat format, const uint32* pColorIn, uint32* pColorOut);
/// Tests whether two SwizzledFormats describe the same pixel format with the same channel swizzle.
///
/// @param lhs [in] First format to compare.
/// @param rhs [in] Second format to compare.
///
/// @return True if both the format and the packed swizzle value match, false otherwise.
constexpr bool IsSameFormat(
    const SwizzledFormat& lhs,
    const SwizzledFormat& rhs)
{
    // Pal::operator== for SwizzledFormat performs exactly this format + packed-swizzle comparison.
    return (lhs == rhs);
}
/// Looks up how many components a channel format has.
///
/// @param [in] format The channel format to look up.
///
/// @returns The componentCount recorded for the format in FormatInfoTable.
inline uint32 NumComponents(
    ChNumFormat format)
{
    const FormatInfo& info = FormatInfoTable[static_cast<size_t>(format)];

    return info.componentCount;
}
/// Looks up which channels are present in a format.
///
/// @param [in] format The format to look up.
///
/// @returns The mask of @ref ChannelFlags recorded for the format in FormatInfoTable.
inline uint32 ComponentMask(
    ChNumFormat format)
{
    const FormatInfo& info = FormatInfoTable[static_cast<size_t>(format)];
    const uint32      mask = info.channelMask;

    // Only the low four bits may be set in a channel mask.
    PAL_ASSERT((mask & 0xF) == mask);

    return mask;
}
/// Tests whether a format is the undefined format.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format is ChNumFormat::Undefined. False otherwise.
constexpr bool IsUndefined(
    ChNumFormat format)
{
    return (ChNumFormat::Undefined == format);
}
/// Tests whether a format is unsigned normalized.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Unorm. False otherwise.
inline bool IsUnorm(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Unorm);
}
/// Tests whether a format is signed normalized.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Snorm. False otherwise.
inline bool IsSnorm(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Snorm);
}
/// Tests whether a format is unsigned scaled.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Uscaled. False otherwise.
inline bool IsUscaled(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Uscaled);
}
/// Tests whether a format is signed scaled.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Sscaled. False otherwise.
inline bool IsSscaled(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Sscaled);
}
/// Tests whether a format is unsigned integer.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Uint. False otherwise.
inline bool IsUint(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Uint);
}
/// Tests whether a format is signed integer.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Sint. False otherwise.
inline bool IsSint(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Sint);
}
/// Tests whether a format is floating point.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Float. False otherwise.
inline bool IsFloat(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Float);
}
/// Tests whether a format is gamma-corrected sRGB.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Srgb. False otherwise.
inline bool IsSrgb(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Srgb);
}
/// Tests whether a format is normalized, i.e. either unsigned normalized or signed normalized.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if IsUnorm() or IsSnorm() holds for the format. False otherwise.
inline bool IsNormalized(
    ChNumFormat format)
{
    if (IsUnorm(format))
    {
        return true;
    }

    return IsSnorm(format);
}
/// Tests whether a format is an integer format, i.e. either unsigned integer or signed integer.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if IsUint() or IsSint() holds for the format. False otherwise.
inline bool IsInteger(
    ChNumFormat format)
{
    if (IsUint(format))
    {
        return true;
    }

    return IsSint(format);
}
/// Tests whether a format is a depth/stencil only format.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is DepthStencil. False otherwise.
inline bool IsDepthStencilOnly(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::DepthStencil);
}
/// Tests whether a format is one of the YUV-planar formats.
///
/// @param [in] format The format to test.
///
/// @returns True if the YuvPlanar property bit is set for the format in FormatInfoTable. False otherwise.
inline bool IsYuvPlanar(
    ChNumFormat format)
{
    const uint32 props = FormatInfoTable[static_cast<size_t>(format)].properties;

    return ((props & YuvPlanar) != 0);
}
/// Tests whether a format is one of the YUV-packed formats.
///
/// @param [in] format The format to test.
///
/// @returns True if the YuvPacked property bit is set for the format in FormatInfoTable. False otherwise.
inline bool IsYuvPacked(
    ChNumFormat format)
{
    const uint32 props = FormatInfoTable[static_cast<size_t>(format)].properties;

    return ((props & YuvPacked) != 0);
}
/// Tests whether a format holds YUV data.
///
/// @param [in] format The format to test.
///
/// @returns True if the format's numericSupport entry in FormatInfoTable is Yuv. False otherwise.
inline bool IsYuv(
    ChNumFormat format)
{
    const NumericSupportFlags numFmt = FormatInfoTable[static_cast<size_t>(format)].numericSupport;

    return (numFmt == NumericSupportFlags::Yuv);
}
/// Checks if a format has alpha.
///
/// A format is reported as having alpha when any of the following holds, tested in this order:
/// - it is one of A8_Unorm, L4A4_Unorm or L8A8_Unorm;
/// - its component mask contains the W channel;
/// - its alpha swizzle selects something other than the constants Zero and One.
///
/// @param [in] format Pixel format.
///
/// @returns True if the pixel format has an alpha channel. False otherwise.
constexpr bool HasAlpha(
SwizzledFormat format)
{
// The explicit format comparisons come first, so ComponentMask() (a non-constexpr table lookup) is only reached when
// none of them match.
return ((format.format == ChNumFormat::A8_Unorm) ||
(format.format == ChNumFormat::L4A4_Unorm) ||
(format.format == ChNumFormat::L8A8_Unorm) ||
((ComponentMask(format.format) & ChannelFlags::W) != 0) ||
((format.swizzle.a != ChannelSwizzle::Zero) && (format.swizzle.a != ChannelSwizzle::One)));
}
/// Tests whether a four-component format leaves its W channel unused, i.e. none of the r/g/b/a swizzles selects W.
///
/// @param [in] format Pixel format to test.
///
/// @returns True if the format has exactly four components and no swizzle entry maps to the W channel. False
///          otherwise.
inline bool HasUnusedAlpha(
    SwizzledFormat format)
{
    // Only four-component formats can qualify.
    if (NumComponents(format.format) != 4)
    {
        return false;
    }

    const bool redUsesW   = (format.swizzle.r == ChannelSwizzle::W);
    const bool greenUsesW = (format.swizzle.g == ChannelSwizzle::W);
    const bool blueUsesW  = (format.swizzle.b == ChannelSwizzle::W);
    const bool alphaUsesW = (format.swizzle.a == ChannelSwizzle::W);

    return ((redUsesW || greenUsesW || blueUsesW || alphaUsesW) == false);
}
/// Converts format into its Unorm equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Unorm format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToUnorm(ChNumFormat format);
/// Converts format into its Snorm equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Snorm format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToSnorm(ChNumFormat format);
/// Converts format into its Uscaled equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Uscaled format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToUscaled(ChNumFormat format);
/// Converts format into its Sscaled equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Sscaled format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToSscaled(ChNumFormat format);
/// Converts format into its Uint equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Uint format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToUint(ChNumFormat format);
/// Converts format into its Sint equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Sint format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToSint(ChNumFormat format);
/// Converts format into its Float equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Float format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToFloat(ChNumFormat format);
/// Converts format into its Srgb equivalent.
///
/// @param [in] format Pixel format.
///
/// @returns Srgb format equivalent of input format. Undefined if none exists.
extern ChNumFormat PAL_STDCALL ConvertToSrgb(ChNumFormat format);
/// Converts the source format to the numeric format used by the provided destination format.
///
/// @param [in] srcFormat Source pixel format; the format being converted.
/// @param [in] dstFormat Destination pixel format; supplies the numeric format to convert to.
///
/// @returns Source format with equivalent numeric format of destination format. Undefined if none exists.
extern ChNumFormat ConvertToDstNumFmt(ChNumFormat srcFormat, ChNumFormat dstFormat);
/// Determines whether the srcFormat and the dstFormat have the same channel format. For the numeric-format
/// counterpart of this check see HaveSameNumFmt().
///
/// @param [in] srcFormat Source channel pixel format.
/// @param [in] dstFormat Destination channel pixel format.
///
/// @returns True if both formats share the same channel format. False otherwise.
extern bool ShareChFmt(ChNumFormat srcFormat, ChNumFormat dstFormat);
/// Tests whether two formats use the same numeric format.
///
/// @param [in] srcFormat Source channel pixel format.
/// @param [in] dstFormat Destination channel pixel format.
///
/// @returns True if both formats have the same numericSupport entry in FormatInfoTable. False otherwise.
inline bool HaveSameNumFmt(
    ChNumFormat srcFormat,
    ChNumFormat dstFormat)
{
    const NumericSupportFlags srcNumFmt = FormatInfoTable[static_cast<size_t>(srcFormat)].numericSupport;
    const NumericSupportFlags dstNumFmt = FormatInfoTable[static_cast<size_t>(dstFormat)].numericSupport;

    return (srcNumFmt == dstNumFmt);
}
/// Looks up the dimensions of one block of a compressed (BC, ETC2 or ASTC) format.
///
/// @param [in] format Compressed format to look up. Passing any other format triggers an assert.
///
/// @returns Block width and height for the format, with a depth of 1. For an unrecognized format all three dimensions
///          are zero.
inline Extent3d CompressedBlockDim(
    ChNumFormat format)
{
    // Width and height of the block; both stay zero when the format is not recognized.
    uint32 blockWidth  = 0;
    uint32 blockHeight = 0;

    switch (format)
    {
    case ChNumFormat::Bc1_Unorm:
    case ChNumFormat::Bc1_Srgb:
    case ChNumFormat::Bc2_Unorm:
    case ChNumFormat::Bc2_Srgb:
    case ChNumFormat::Bc3_Unorm:
    case ChNumFormat::Bc3_Srgb:
    case ChNumFormat::Bc4_Unorm:
    case ChNumFormat::Bc4_Snorm:
    case ChNumFormat::Bc5_Unorm:
    case ChNumFormat::Bc5_Snorm:
    case ChNumFormat::Bc6_Ufloat:
    case ChNumFormat::Bc6_Sfloat:
    case ChNumFormat::Bc7_Unorm:
    case ChNumFormat::Bc7_Srgb:
        blockWidth  = CompressedBcBlockDim;
        blockHeight = CompressedBcBlockDim;
        break;
    case ChNumFormat::Etc2X8Y8Z8_Unorm:
    case ChNumFormat::Etc2X8Y8Z8_Srgb:
    case ChNumFormat::Etc2X8Y8Z8W1_Unorm:
    case ChNumFormat::Etc2X8Y8Z8W1_Srgb:
    case ChNumFormat::Etc2X8Y8Z8W8_Unorm:
    case ChNumFormat::Etc2X8Y8Z8W8_Srgb:
    case ChNumFormat::Etc2X11_Unorm:
    case ChNumFormat::Etc2X11_Snorm:
    case ChNumFormat::Etc2X11Y11_Unorm:
    case ChNumFormat::Etc2X11Y11_Snorm:
        blockWidth  = CompressedEtcBlockDim;
        blockHeight = CompressedEtcBlockDim;
        break;
    // ASTC formats carry their block footprint in the format name (WxH).
    case ChNumFormat::AstcLdr4x4_Unorm:
    case ChNumFormat::AstcLdr4x4_Srgb:
    case ChNumFormat::AstcHdr4x4_Float:
        blockWidth  = 4;
        blockHeight = 4;
        break;
    case ChNumFormat::AstcLdr5x4_Unorm:
    case ChNumFormat::AstcLdr5x4_Srgb:
    case ChNumFormat::AstcHdr5x4_Float:
        blockWidth  = 5;
        blockHeight = 4;
        break;
    case ChNumFormat::AstcLdr5x5_Unorm:
    case ChNumFormat::AstcLdr5x5_Srgb:
    case ChNumFormat::AstcHdr5x5_Float:
        blockWidth  = 5;
        blockHeight = 5;
        break;
    case ChNumFormat::AstcLdr6x5_Unorm:
    case ChNumFormat::AstcLdr6x5_Srgb:
    case ChNumFormat::AstcHdr6x5_Float:
        blockWidth  = 6;
        blockHeight = 5;
        break;
    case ChNumFormat::AstcLdr6x6_Unorm:
    case ChNumFormat::AstcLdr6x6_Srgb:
    case ChNumFormat::AstcHdr6x6_Float:
        blockWidth  = 6;
        blockHeight = 6;
        break;
    case ChNumFormat::AstcLdr8x5_Unorm:
    case ChNumFormat::AstcLdr8x5_Srgb:
    case ChNumFormat::AstcHdr8x5_Float:
        blockWidth  = 8;
        blockHeight = 5;
        break;
    case ChNumFormat::AstcLdr8x6_Unorm:
    case ChNumFormat::AstcLdr8x6_Srgb:
    case ChNumFormat::AstcHdr8x6_Float:
        blockWidth  = 8;
        blockHeight = 6;
        break;
    case ChNumFormat::AstcLdr8x8_Unorm:
    case ChNumFormat::AstcLdr8x8_Srgb:
    case ChNumFormat::AstcHdr8x8_Float:
        blockWidth  = 8;
        blockHeight = 8;
        break;
    case ChNumFormat::AstcLdr10x5_Unorm:
    case ChNumFormat::AstcLdr10x5_Srgb:
    case ChNumFormat::AstcHdr10x5_Float:
        blockWidth  = 10;
        blockHeight = 5;
        break;
    case ChNumFormat::AstcLdr10x6_Unorm:
    case ChNumFormat::AstcLdr10x6_Srgb:
    case ChNumFormat::AstcHdr10x6_Float:
        blockWidth  = 10;
        blockHeight = 6;
        break;
    case ChNumFormat::AstcLdr10x8_Unorm:
    case ChNumFormat::AstcLdr10x8_Srgb:
    case ChNumFormat::AstcHdr10x8_Float:
        blockWidth  = 10;
        blockHeight = 8;
        break;
    case ChNumFormat::AstcLdr10x10_Unorm:
    case ChNumFormat::AstcLdr10x10_Srgb:
    case ChNumFormat::AstcHdr10x10_Float:
        blockWidth  = 10;
        blockHeight = 10;
        break;
    case ChNumFormat::AstcLdr12x10_Unorm:
    case ChNumFormat::AstcLdr12x10_Srgb:
    case ChNumFormat::AstcHdr12x10_Float:
        blockWidth  = 12;
        blockHeight = 10;
        break;
    case ChNumFormat::AstcLdr12x12_Unorm:
    case ChNumFormat::AstcLdr12x12_Srgb:
    case ChNumFormat::AstcHdr12x12_Float:
        blockWidth  = 12;
        blockHeight = 12;
        break;
    default:
        // Callers must only pass compressed formats.
        PAL_ASSERT_ALWAYS();
        break;
    }

    Extent3d blockDim = {};

    // Every recognized format has a non-zero width; unrecognized formats keep the zero-initialized extent.
    if (blockWidth != 0)
    {
        blockDim.width  = blockWidth;
        blockDim.height = blockHeight;
        blockDim.depth  = 1;
    }

    return blockDim;
}
/// Scales an extent measured in compressed blocks to the equivalent extent measured in texels.
///
/// @param [in] format Compressed format whose block dimensions are used.
/// @param [in] width  Width in blocks.
/// @param [in] height Height in blocks.
/// @param [in] depth  Depth in blocks.
///
/// @returns Structure containing the texel width, height and depth: each input multiplied by the matching block
///          dimension reported by CompressedBlockDim().
inline Extent3d CompressedBlocksToTexels(
    ChNumFormat format,
    uint32      width,
    uint32      height,
    uint32      depth)
{
    // Start from the size of a single block and scale each axis by the block count.
    Extent3d texels = CompressedBlockDim(format);

    texels.width  = texels.width  * width;
    texels.height = texels.height * height;
    texels.depth  = texels.depth  * depth;

    return texels;
}
/// Converts an extent measured in texels to the number of compressed blocks needed to cover it.
///
/// @param [in] format Compressed format whose block dimensions are used.
/// @param [in] width  Width in texels.
/// @param [in] height Height in texels.
/// @param [in] depth  Depth in texels.
///
/// @returns Structure containing the block width, height and depth: each input divided by the matching block
///          dimension reported by CompressedBlockDim(), using Util::RoundUpQuotient().
inline Extent3d CompressedTexelsToBlocks(
    ChNumFormat format,
    uint32      width,
    uint32      height,
    uint32      depth)
{
    const Extent3d blockDim = CompressedBlockDim(format);

    Extent3d blocks = blockDim;

    blocks.width  = Util::RoundUpQuotient(width,  blockDim.width);
    blocks.height = Util::RoundUpQuotient(height, blockDim.height);
    blocks.depth  = Util::RoundUpQuotient(depth,  blockDim.depth);

    return blocks;
}
/// Looks up the size, in bits, of one pixel or element of the given format.
///
/// @param format The format to look up.
///
/// @return The bitsPerPixel value recorded for the format in FormatInfoTable.
inline uint32 BitsPerPixel(
    ChNumFormat format)
{
    const FormatInfo& info = FormatInfoTable[static_cast<size_t>(format)];

    return info.bitsPerPixel;
}
/// Looks up the size, in bytes, of one pixel or element of the given format.
///
/// @param format The format to look up.
///
/// @return BitsPerPixel() for the format divided by eight, rounded down.
inline uint32 BytesPerPixel(
    ChNumFormat format)
{
    const uint32 bits = BitsPerPixel(format);

    return (bits / 8);
}
/// Tests whether a channel swizzle may be used with the given format. The constant swizzles Zero and One are always
/// allowed; X, Y, Z and W are allowed only when the format's component mask contains the matching channel.
///
/// @param [in] format  The pixel format to check against.
/// @param [in] swizzle The channel swizzle to validate.
///
/// @returns True if the specified channel swizzle is valid for the given format. False otherwise.
inline bool IsValidChannelSwizzle(
    ChNumFormat    format,
    ChannelSwizzle swizzle)
{
    const uint32 mask = ComponentMask(format);

    switch (swizzle)
    {
    case ChannelSwizzle::Zero:
    case ChannelSwizzle::One:
        return true;
    case ChannelSwizzle::X:
        return ((mask & ChannelFlags::X) != 0);
    case ChannelSwizzle::Y:
        return ((mask & ChannelFlags::Y) != 0);
    case ChannelSwizzle::Z:
        return ((mask & ChannelFlags::Z) != 0);
    case ChannelSwizzle::W:
        return ((mask & ChannelFlags::W) != 0);
    default:
        PAL_NEVER_CALLED();
        break;
    }

    // Unknown swizzle values are never valid.
    return false;
}
/// Looks up the per-component bit counts of a format.
///
/// @param [in] format The format to look up.
///
/// @returns Pointer to the first element of the format's bitCount array in FormatInfoTable, an array of four counts.
inline const uint32* ComponentBitCounts(
    ChNumFormat format)
{
    const FormatInfo& info = FormatInfoTable[static_cast<size_t>(format)];

    return info.bitCount;
}
/// Finds the largest bit count among the four component entries of a format.
///
/// @param [in] format The channel format to look up.
///
/// @returns The maximum of the four bitCount values recorded for the format in FormatInfoTable.
inline uint32 MaxComponentBitCount(
    ChNumFormat format)
{
    const FormatInfo& info = FormatInfoTable[static_cast<size_t>(format)];

    // Reduce pairwise: first two components, last two components, then the larger of the two results.
    const uint32 maxFirstPair  = Util::Max(info.bitCount[0], info.bitCount[1]);
    const uint32 maxSecondPair = Util::Max(info.bitCount[2], info.bitCount[3]);

    return Util::Max(maxFirstPair, maxSecondPair);
}
/// Tests whether a format is one of the block-compressed formats.
///
/// @param [in] format The format to test.
///
/// @returns True if the BlockCompressed property bit is set for the format in FormatInfoTable. False otherwise.
inline bool IsBlockCompressed(
    ChNumFormat format)
{
    const uint32 props = FormatInfoTable[static_cast<size_t>(format)].properties;

    return ((props & BlockCompressed) != 0);
}
/// Tests whether a format is one of the macro-pixel-packed formats.
///
/// @param [in] format The format to test.
///
/// @returns True if the MacroPixelPacked property bit is set for the format in FormatInfoTable. False otherwise.
inline bool IsMacroPixelPacked(
    ChNumFormat format)
{
    const uint32 props = FormatInfoTable[static_cast<size_t>(format)].properties;

    return ((props & MacroPixelPacked) != 0);
}
/// Tests whether a format is an RGB macro-pixel-packed format, i.e. macro-pixel-packed but not YUV.
///
/// @param [in] format The format to test.
///
/// @returns True if IsMacroPixelPacked() holds and IsYuv() does not. False otherwise.
inline bool IsMacroPixelPackedRgbOnly(
    ChNumFormat format)
{
    if (IsMacroPixelPacked(format) == false)
    {
        return false;
    }

    return (IsYuv(format) == false);
}
/// Returns the base-2 logarithm of the subsampling ratio between the luma plane and the chroma plane(s) of a YUV
/// planar format. Right-shifting the luma plane's dimensions by these amounts gives the chroma plane's dimensions.
///
/// @param [in] format Format.
/// @param [in] plane  Image plane to query for.
///
/// @returns Per-dimension shift amounts between the luma plane and the requested plane. All zero for plane 0 and for
///          formats which are not YUV planar.
inline Extent3d Log2SubsamplingRatio(
    ChNumFormat format,
    uint32      plane)
{
    // A zero shift in every dimension means "sampled at full rate". That is the answer for every plane of a format
    // which is not YUV planar, and for plane 0 (the luma plane) of one which is.
    Extent3d ratio = { };

    if ((IsYuvPlanar(format) == false) || (plane == 0))
    {
        return ratio;
    }

    PAL_ASSERT((plane == 1) || (plane == 2));

    switch (format)
    {
    // 4:4:4 - chroma has as many samples as luma in every direction, so no shift is needed.
    case ChNumFormat::P412:
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924
    case ChNumFormat::YUV_444P10:
    case ChNumFormat::YUV_444P12:
    case ChNumFormat::YUV_444P16:
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925
    case ChNumFormat::YV24:
    case ChNumFormat::NV24:
    case ChNumFormat::P410:
    case ChNumFormat::P416:
#endif
        break;
    // 4:2:0 - chroma has half as many samples both horizontally and vertically.
    case ChNumFormat::YV12:
    case ChNumFormat::NV12:
    case ChNumFormat::NV21:
    case ChNumFormat::P010:
    case ChNumFormat::P012:
    case ChNumFormat::P016:
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924
    case ChNumFormat::YUV_420P10:
    case ChNumFormat::YUV_420P12:
    case ChNumFormat::YUV_420P16:
#endif
        ratio.width  = 1; // Halving is a right shift by one.
        ratio.height = 1;
        break;
    // 4:2:2 - chroma has half as many samples horizontally and the same number vertically.
    case ChNumFormat::P208:
    case ChNumFormat::P210:
    case ChNumFormat::P212:
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923
    case ChNumFormat::P216:
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924
    case ChNumFormat::YUV_422P10:
    case ChNumFormat::YUV_422P12:
    case ChNumFormat::YUV_422P16:
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925
    case ChNumFormat::YV16:
#endif
        ratio.width = 1;
        break;
    // 4:1:1 - chroma has a quarter as many samples horizontally and the same number vertically.
    case ChNumFormat::NV11:
        ratio.width = 2; // Quartering is a right shift by two.
        break;
    default:
        PAL_NEVER_CALLED(); // Did we miss a new YUV planar format?
        break;
    }

    return ratio;
}
/// Converts a linearly-scaled color value to gamma-corrected sRGB. This is the inverse direction of GammaToLinear().
///
/// @param [in] linear Linear color value
///
/// @returns Gamma-corrected sRGB color value
extern float LinearToGamma(float linear);
/// Converts a gamma-corrected sRGB color value to linear color space. This is the inverse direction of
/// LinearToGamma().
///
/// @param [in] gammaCorrectedVal Gamma-corrected sRGB color value
///
/// @returns Linear color value
extern float GammaToLinear(float gammaCorrectedVal);
/// Checks to see if a given format is a MM format.
/// NOTE(review): "MM" is not defined in this header; presumably it refers to multimedia formats - confirm against the
/// definition.
///
/// @param [in] format The format to check.
///
/// @returns True if the format is an MM format. False otherwise.
extern bool IsMmFormat(ChNumFormat format);
/// Checks to see if a given format is a MM12 format.
///
/// @param [in] format The format to check.
///
/// @returns True if the format is an MM12 format. False otherwise.
extern bool IsMm12Format(ChNumFormat format);
/// Checks to see if a given format is a MM10 format.
///
/// @param [in] format The format to check.
///
/// @returns True if the format is an MM10 format. False otherwise.
extern bool IsMm10Format(ChNumFormat format);
} // Formats
} // Pal
@@ -0,0 +1,711 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palGpuMemory.h
* @brief Defines the Platform Abstraction Library (PAL) IGpuMemory interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
#if defined(_WIN32)
struct _SECURITY_ATTRIBUTES;
#endif
namespace Pal
{
// Forward declarations of types that are only named (not defined) in the visible part of this header.
class IGpuMemory;
class IDevice;
class IImage;
// Opaque declaration of a scoped enum with a uint32 underlying type; its enumerators are defined elsewhere.
enum class VaRange : uint32;
/// Specifies the base level priority per GPU memory allocation as a hint to the memory manager in the event it needs
/// to select allocations to page out of their preferred heaps. Values increase with priority, from Unused (paged out
/// first) to VeryHigh (paged out last).
enum class GpuMemPriority : uint32
{
Unused = 0x0, ///< Indicates that the allocation is not currently being used at all, and should be the first
/// choice to be paged out.
VeryLow = 0x1, ///< Lowest priority to keep in its preferred heap.
Low = 0x2, ///< Low priority to keep in its preferred heap.
Normal = 0x3, ///< Normal priority to keep in its preferred heap.
High = 0x4, ///< High priority to keep in its preferred heap (e.g., render targets).
VeryHigh = 0x5, ///< Highest priority to keep in its preferred heap. Last choice to be paged out (e.g., page
/// tables, displayable allocations).
Count ///< Number of priority levels defined above.
};
/// Specifies a finer granularity to the base level priority per GPU memory allocation as a hint to the memory manager
/// in the event it needs to select allocations to page out of their preferred heaps. Offsets run from Offset0 (same
/// as the base level) to Offset7 (highest within the base level).
enum class GpuMemPriorityOffset : uint32
{
Offset0 = 0x0, ///< Same priority as Base Level.
Offset1 = 0x1, ///< Next priority from Base Level.
Offset2 = 0x2, ///< Next priority from Base Level.
Offset3 = 0x3, ///< Next priority from Base Level.
Offset4 = 0x4, ///< Next priority from Base Level.
Offset5 = 0x5, ///< Next priority from Base Level.
Offset6 = 0x6, ///< Next priority from Base Level.
Offset7 = 0x7, ///< Highest priority from Base Level.
Count ///< Number of priority offsets defined above.
};
/// Specifies the access mode for unmapped pages in a virtual GPU memory allocation.
enum class VirtualGpuMemAccessMode : uint32
{
Undefined = 0x0, ///< Used in situations where no special accessMode is needed.
NoAccess = 0x1, ///< All accesses of unmapped pages will trigger a GPU page fault.
ReadZero = 0x2, ///< Reads of unmapped pages return zero, and writes are discarded.
Count ///< Number of access modes defined above.
};
/// Controls the behavior of an allocation with respect to the MALL. Referenced by the mallRangeActive creation flag,
/// which requires the policy to be either Always or Never.
enum class GpuMemMallPolicy : uint32
{
Default = 0x0, ///< MALL policy is decided by the driver.
Never = 0x1, ///< This allocation is never put through the MALL.
Always = 0x2, ///< This allocation is always put through the MALL.
};
/// Bitmask of cases where RPM view memory accesses will bypass the MALL. Each non-zero enumerator occupies its own
/// bit, so values can be combined with bitwise OR.
enum RpmViewsBypassMall : uint32
{
RpmViewsBypassMallOff = 0x0, ///< Disable MALL bypass.
RpmViewsBypassMallOnRead = 0x1, ///< Skip MALL for read access of views created in RPM.
RpmViewsBypassMallOnWrite = 0x2, ///< Skip MALL for write access of views created in RPM.
RpmViewsBypassMallOnCbDbWrite = 0x4, ///< Control the RPM CB/DB behavior.
};
/// Used for specifying a subregion of the allocation as having a different MALL policy from the rest of the
/// allocation. The subregion is expressed in units of 4k pages.
struct GpuMemMallRange
{
uint32 startPage; ///< Starting 4k page that will obey the specified mallPolicy.
uint32 numPages; ///< Number of 4k pages that will obey the specified mallPolicy.
};
/// Specifies flags for @ref IGpuMemory creation. The flags are single-bit fields packed into 64 bits; the same
/// storage can be read or written as a whole through u64All.
union GpuMemoryCreateFlags
{
struct
{
uint64 virtualAlloc : 1; ///< Create a _virtual_ as opposed to _real_ GPU memory allocation.
/// Only VA space will be allocated, and pages must be mapped via
/// IQueue::RemapVirtualMemoryPages().
uint64 shareable : 1; ///< Memory can be shared between devices in the same process that
/// report the sharedMemory flag from
/// IDevice::GetMultiGpuCompatibility().
uint64 interprocess : 1; ///< Memory will be visible to other processes
/// (they may choose to open it).
uint64 presentable : 1; ///< Memory can be bound to an image that will be used by presents.
uint64 flippable : 1; ///< Memory can be bound to an image that will be used by flip
/// presents.
uint64 stereo : 1; ///< Memory will be used for stereo (DXGI or AQBS stereo).
uint64 globallyCoherent : 1; ///< Memory needs to be globally coherent,
/// indicating the driver must manage both
/// CPU caches and GPU caches that are not flushed on
/// command buffer boundaries.
uint64 xdmaBuffer : 1; ///< GPU memory will be used for an XDMA cache buffer for
/// transferring data between GPUs in a multi-GPU configuration.
uint64 turboSyncSurface : 1; ///< The memory will be used for TurboSync private swapchain primary.
uint64 typedBuffer : 1; ///< GPU memory will be permanently considered a single
/// typed buffer pseudo-object
/// with the properties given in typedBufferInfo.
uint64 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices.
uint64 useReservedGpuVa : 1; ///< Use GPU virtual address previously reserved by another
/// memory object. It is invalid when using the shadow descriptor
/// table VA range.
uint64 autoPriority : 1; ///< Allow the platform to automatically determine the priority of
/// this GPU memory allocation. Flag is only valid if the device
/// reports that it supports this feature, and will result in an
/// error otherwise.
uint64 busAddressable : 1; ///< Create Bus Addressable memory. Allow memory to be used by other
/// device on the PCIe bus by exposing a write-only bus address.
uint64 sdiExternal : 1; ///< Create External Physical memory from an already allocated memory
/// on remote device. Similar to virtual allocations (no physical
/// backing) but have an immutable page mapping. The client must
/// specify surfaceBusAddr and markerBusAddr either at creation time
/// in GpuMemoryCreateInfo or by calling SetSdiRemoteBusAddress
/// once before using the GPU memory. The page mappings for an
/// allocation with this flag set must be initialized by including a
/// reference to it in the ppExternPhysMem list for the first
/// submission that references it.
uint64 sharedViaNtHandle : 1; ///< Memory will be shared by using Nt handle.
uint64 peerWritable : 1; ///< The memory can be open as peer memory and be writable.
uint64 tmzProtected : 1; ///< The memory is protected using TMZ (Trusted Memory Zone) or HSFB
/// (Hybrid Secure Framebuffer). It is not CPU accessible,
/// and GPU access is restricted by the hardware such that data
/// cannot be copied from protected memory into unprotected memory.
uint64 placeholder0 : 1; ///< Placeholder.
uint64 externalOpened : 1; ///< Specifies the GPUMemory is opened.
uint64 restrictedContent : 1; ///< Specifies the GPUMemory is protected content.
uint64 restrictedAccess : 1; ///< Specifies the GPUMemory is restricted shared access resource.
uint64 crossAdapter : 1; ///< Specifies the GPUMemory is shared cross-adapter resource.
uint64 cpuInvisible : 1; ///< By default, PAL makes every allocation CPU visible if all of its
/// preferred heaps are CPU visible. This flag can be used to override
/// this behavior when the client knows the memory will never be mapped
/// for CPU access. If this flag is set, calls to IGpuMemory::Map()
/// on this object will fail.
uint64 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache.
/// But the memory still would be cached by other cache hierarchy
/// like L0, RB caches, L1, and L3.
uint64 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the
/// MALL. If this is set, then the mallPolicy enumeration must be set
/// to either "always" or "never".
uint64 explicitSync : 1; ///< If set, shared memory will skip syncs in the kernel and all
/// drivers that use this memory must handle syncs explicitly.
uint64 privPrimary : 1; ///< This is a private primary surface gpu memory.
uint64 privateScreen : 1; ///< GPU memory will be used for a private screen image.
uint64 kmdShareUmdSysMem : 1; ///< UMD will allocate/free a memory buffer to be shared with KMD.
uint64 deferCpuVaReservation : 1; ///< KMD will allocate with the "CpuVisibleOnDemand" alloc flag.
/// Ignored for non-CPU-visible allocations.
uint64 placeholder1 : 1; ///< Placeholder.
uint64 startVaHintFlag : 1; ///< startVaHintFlag is set to 1 for passing startVaHint address
/// to set baseVirtAddr as startVaHint for memory allocation.
// Both branches below occupy two bits, so the fields that follow sit at the same bit positions in either build.
#if PAL_AMDGPU_BUILD
uint64 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes
/// the allocation upon creation, currently, only GpuHeapLocal and
/// GpuHeapInvisible are supported.
uint64 discardable : 1; ///< If set, this gpu memory object can be discarded under memory
/// pressure without keeping the content.
#else
uint64 placeholder2 : 2; ///< Placeholder for the two bits used only when PAL_AMDGPU_BUILD is set.
#endif
uint64 directCaptureSource : 1; ///< Memory will be mapped to DirectCapture resource's KMD-managed
/// private VA.
uint64 reserved : 28; ///< Reserved for future use.
};
uint64 u64All; ///< Flags packed as 64-bit uint.
};
/// Specifies properties of a typed buffer pseudo-object. When this is specified in GpuMemoryCreateInfo along with the
/// typedBuffer flag, the GPU memory object has been permanently cast as a single typed buffer. A typed buffer is very
/// similar to a linear 3D image: it has a format, extent, and row/depth pitch values.
///
/// Note that the typed buffer concept is used in other parts of the PAL interface and some of those instances may not
/// require a permanent typed buffer association. In such cases multiple typed buffers can be "bound" to one GPU memory
/// object at arbitrary offsets without any need to set the typedBuffer flag or fill out a TypedBufferCreateInfo.
struct TypedBufferCreateInfo
{
SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle.
Extent3d extent; ///< Dimensions in pixels WxHxD.
uint32 rowPitch; ///< Offset in bytes between the same X position on two consecutive lines.
uint32 depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices.
bool depthIsSubres; ///< True if the depth slices should be treated as an array of 2D subresources; false
/// otherwise.
};
/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateGpuMemory().
///
/// See the @ref IGpuMemory for additional restrictions on the size, alignment, vaRange, and descrVirtAddr fields.
struct GpuMemoryCreateInfo
{
GpuMemoryCreateFlags flags; ///< GPU memory flags.
gpusize size; ///< Amount of GPU memory to allocate in bytes.
gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the
/// allocation granularity will be used.
VaRange vaRange; ///< Virtual address range for the GPU memory allocation.
// The members of this anonymous union share storage; which one is meaningful depends on the flags and vaRange
// conditions described on each member.
union
{
const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to the
/// memory object which previously reserved the GPU VA range to be used
/// by the new memory object.
gpusize descrVirtAddr; ///< Must be zero unless vaRange is ShadowDescriptorTable, in which case
/// it must specify the GPU VA of the corresponding DescriptorTable.
/// It doesn't need to be the base VA of the DescriptorTable allocation
/// but must be aligned to "alignment".
gpusize replayVirtAddr; ///< Must be zero unless vaRange is CaptureReplay, in which case it must
/// specify the GPU VA of the corresponding memory object.
gpusize startVaHint; ///< Client-provided start VA hint to use as baseVirtAddr. If the given
/// hint is not properly aligned, the next higher aligned address is used
/// as the hint. If the hint is available and within the right vaRange
/// (where vaRange is VaRange::Default), baseVirtAddr is set to the hint.
/// If the hint is unavailable, the next higher available address between
/// startVaHint and the maximum of the vaRange is used. If either case
/// fails, baseVirtAddr is chosen as normal.
};
GpuMemPriority priority; ///< Hint to the OS paging process on how important it is to keep this
/// allocation in its preferred heap.
GpuMemPriorityOffset priorityOffset; ///< Offset from the base level priority. A higher offset means higher priority
/// within the same base level. Currently supported on Windows only.
GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via the
/// MALL (memory access last level). Only valid if "supportsMall" is set in
/// DeviceProperties.
GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set.
/// Any pages outside of this range will use the opposite MALL policy from
/// what is specified in "mallPolicy".
/// Describes how the allocation will be accessed. If heapAccess is set to something other than @ref
/// GpuHeapAccessExplicit, then PAL decides the appropriate heap to allocate memory from based on this member and
/// @ref heaps is ignored. Otherwise heap selection respects the selection in @ref heaps.
GpuHeapAccess heapAccess;
uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations.
GpuHeap heaps[GpuHeapCount]; ///< List of allowed memory heaps, in order of preference. It will be ignored if
/// @ref heapAccess is set to something other than @ref GpuHeapAccessExplicit.
/// The pointer to an Image object the memory object will be bound to. It must be specified in special cases where
/// a memory object is permanently linked to an Image such as presentable images or shared resources on Windows.
/// For other cases, it's highly encouraged to provide the image object pointer.
///
/// When @ref compression is set to Default, clients can benefit from PAL's internal logic to choose the optimal
/// GFX12-style distributed compression setting (enabled or disabled) based on properties of this image (e.g., usage
/// flags); otherwise PAL will disable the compression by default when @ref pImage is nullptr.
IImage* pImage;
TypedBufferCreateInfo typedBufferInfo; ///< This struct must be filled out if the typedBuffer flag is set.
/// This GPU memory will be permanently considered a typed buffer.
VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages, WDDM only.
gpusize surfaceBusAddr; ///< Surface bus address of Bus Addressable Memory.
/// Only valid when GpuMemoryCreateFlags::sdiExternal is set.
gpusize markerBusAddr; ///< Marker bus address of Bus Addressable Memory. The client can:
/// 1. Write to the marker.
/// 2. Let the GPU wait until a value is written to the marker before
/// issuing the next command.
/// Only valid when GpuMemoryCreateFlags::sdiExternal is set.
/// Client override for GFX12-style distributed compression. Only meaningful on devices that set the
/// supportDistributedCompression DeviceProperties flag. By default, PAL will apply a heuristic to determine
/// whether or not to enable compression based on properties of the memory allocation (e.g., CPU-visibility or
/// properties of the attached pImage). This mode does not apply to virtual IGpuMemory objects (such objects
/// inherit their compression behavior from their backing physical memory on a page-by-page basis).
///
/// @note If allocating memory intended to back multiple resources (e.g., a heap for DX12 "placed resources") where
/// pImage is nullptr, the client should set compression to Enable then rely on per-resource or per-view controls to
/// disable compression for resources as needed (use compressionMode fields in ImageCreateInfo, BufferViewInfo,
/// ImageViewInfo, etc.).
///
/// If a client intends to enable distributed compression on buffers, the buffer base alignment and size should be
/// aligned to the DCC minimum compression unit size (256 bytes); otherwise there is a potential corruption issue.
/// For example, consider an allocation holding two buffers (placed resource or driver suballocation). Suppose the
/// first buffer has size 4 KiB + 128 bytes with compression enabled and the second buffer has size 128 bytes with
/// compression disabled (buffer view compressionMode is ReadBypassWriteDisable). When compressing the trailing
/// 128 bytes of the first buffer, it will compress the second buffer as well. Reading the second buffer will result
/// in corrupted content.
TriState compression;
};
/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreatePinnedGpuMemory().
///
/// See the @ref IGpuMemory for additional restrictions on the size and vaRange fields.
struct PinnedGpuMemoryCreateInfo
{
const void* pSysMem; ///< Pointer to the system memory that should be pinned for GPU access. Must be
/// aligned to realMemAllocGranularity in DeviceProperties.
size_t size; ///< Amount of system memory to pin for GPU access, in bytes.
VaRange vaRange; ///< Virtual address range for the GPU memory allocation.
gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the
/// Platform's allocation granularity will be used.
GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be
/// accessed via the MALL (memory access last level). Only valid
/// if "supportsMall" is set in DeviceProperties.
GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive
/// is set. Any pages outside of this range will use the opposite
/// MALL policy from what is specified in "mallPolicy".
/// NOTE(review): this struct has no "flags" member; confirm which
/// mallRangeActive flag (if any) gates this field for pinned memory.
};
/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateSvmGpuMemory().
///
/// See the @ref IGpuMemory for additional restrictions on the size and alignment fields.
struct SvmGpuMemoryCreateInfo
{
GpuMemoryCreateFlags flags; ///< GPU memory flags.
gpusize size; ///< Amount of SVM memory to allocate in bytes.
/// The total amount of SVM memory can't exceed the value set in
/// maxSvmSize when the platform is created.
gpusize alignment; ///< Byte alignment of the allocation's SVM VA. If zero, an
/// alignment matching the allocation granularity will be used.
const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to
/// the memory object which previously reserved the GPU VA range
/// to be used by the new memory object.
bool isUsedForKernel; ///< Memory will be used to store a kernel that executes on the GPU.
GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be
/// accessed via the MALL (memory access last level). Only valid
/// if "supportsMall" is set in DeviceProperties.
GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive
/// is set. Any pages outside of this range will use the opposite
/// MALL policy from what is specified in "mallPolicy".
};
/// Specifies parameters for opening a shared GPU memory object on another device.
/// NOTE(review): presumably the input structure to IDevice::OpenSharedGpuMemory() - confirm against palDevice.h.
struct GpuMemoryOpenInfo
{
IGpuMemory* pSharedMem; ///< Shared GPU memory object from another device to open.
};
/// Specifies parameters for opening a GPU memory object on another device for peer-to-peer memory transfers.
/// NOTE(review): presumably the input structure to IDevice::OpenPeerGpuMemory() - confirm against palDevice.h.
struct PeerGpuMemoryOpenInfo
{
IGpuMemory* pOriginalMem; ///< GPU memory object from another device to open for peer-to-peer memory transfers.
};
/// Specifies parameters for opening another non-PAL device's GPU memory for access from this device. Input structure
/// to IDevice::OpenExternalSharedGpuMemory().
struct ExternalGpuMemoryOpenInfo
{
ExternalResourceOpenInfo resourceInfo; ///< Information describing the external GPU memory.
TypedBufferCreateInfo typedBufferInfo; ///< Typed buffer properties, used when flags.typedBuffer is set.
GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via
/// the MALL (memory access last level). Only valid if "supportsMall" is
/// set in DeviceProperties.
GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set.
/// Any pages outside of this range will use the opposite MALL policy
/// from what is specified in "mallPolicy".
union
{
struct
{
uint32 typedBuffer : 1; ///< GPU memory will be permanently considered a single typed buffer pseudo-object
/// with the properties given in typedBufferInfo.
uint32 gl2Uncached : 1; ///< Specifies the GPU memory is un-cached in the GPU L2 cache.
uint32 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the MALL.
/// If this is set, then the mallPolicy enumeration must be set to either
/// "always" or "never".
uint32 reserved : 29; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< External GPU memory open info flags.
};
/// The fundamental information that describes a GPU memory object that is stored directly in each IGpuMemory.
/// It can be accessed without a virtual call via IGpuMemory::Desc().
struct GpuMemoryDesc
{
gpusize gpuVirtAddr; ///< GPU virtual address of the GPU memory allocation.
gpusize size; ///< Size of the GPU memory allocation, in bytes.
gpusize clientSize; ///< Size of the client requested GPU memory allocation, in bytes.
gpusize alignment; ///< Required GPU virtual address alignment, in bytes.
uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations.
GpuHeap heaps[GpuHeapCount]; ///< List of preferred memory heaps, in order of preference.
gpusize surfaceBusAddr; ///< Bus address of the SDI memory surface. Not initialized until the memory is made
/// resident; the client needs to call InitBusAddressableGpuMemory() to query and
/// update it before this is valid.
gpusize markerBusAddr; ///< Bus address of the SDI marker. Not initialized until the memory is made
///< resident; the client needs to call InitBusAddressableGpuMemory() to query and
///< update it before this is valid.
union
{
struct
{
uint32 isVirtual : 1; ///< GPU memory is not backed by physical memory and must be remapped before the
/// GPU can safely access it. Will also be set for sdiExternal allocations. See
/// GpuMemoryCreateFlags::sdiExternal.
uint32 isPeer : 1; ///< GPU memory object was created with @ref IDevice::OpenPeerGpuMemory.
uint32 isShared : 1; ///< GPU memory object was created either with
/// @ref IDevice::OpenExternalSharedGpuMemory or OpenSharedGpuMemory.
/// This IGpuMemory references memory created either by another process or
/// another device with the exception of peer access.
uint32 isExternal : 1; ///< GPU memory object was created with @ref IDevice::OpenExternalSharedGpuMemory.
/// This IGpuMemory references memory that was created either by another process
/// or by a device that doesn't support sharedMemory with this object's device
/// (i.e., MDA sharing on Windows).
uint32 isSvmAlloc : 1; ///< GPU memory is allocated in system memory.
/// Valid only when IOMMUv2 is supported.
uint32 isExecutable : 1; ///< GPU memory is used for execution. Valid only when IOMMUv2 is supported.
uint32 isExternPhys : 1; ///< GPU memory is external physical memory.
uint32 placeholder0 : 1; ///< Reserved for a future memory flag.
uint32 isCompressed : 1; ///< Set for physical allocations where UMD requested PTE.D=1 to enable
/// GFX12-style distributed compression.
uint32 reserved : 23; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< GPU memory desc flags.
uint64 uniqueId; ///< Unique ID given to each GPU memory object, allows client tracking of GPU memory allocations.
};
/// Defines GPU memory sub-allocation info. Holds the start address of the whole memory, plus the offset and size that
/// locate the sub-allocated range within it.
struct GpuMemSubAllocInfo
{
gpusize address; ///< Start address of the memory, not including the offset.
gpusize offset; ///< Offset from the start address of the memory.
gpusize size; ///< Size of the memory.
};
/// Specifies a GPU memory object together with flags giving more specific usage details. An array of these structures
/// is passed to PAL residency operations.
///
/// @see IDevice::AddGpuMemoryReferences
/// @see IQueue::Submit
struct GpuMemoryRef
{
union
{
struct
{
uint32 readOnly : 1; ///< The allocation will not be written using this reference.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< GPU memory reference flags.
IGpuMemory* pGpuMemory; ///< The GPU memory object referenced by this residency operation.
};
/// Specifies a Display Output Post-Processing (DOPP) allocation that will be referenced by a submission along with
/// additional info describing how it will be used.
///
/// @see IQueue::Submit
struct DoppRef
{
union
{
struct
{
uint32 pfpa : 1; ///< Access to this DOPP allocation will be redirected to the primary pending
/// present (i.e., pre-flip primary access). If not set, access will
/// refer to the current onscreen primary.
uint32 lastPfpaCmd : 1; ///< This submission will be the last access of this pfpa allocation
/// for this frame. The pfpa interval will end once this submit
/// completes, allowing the corresponding vidPnSource to flip.
/// This flag is invalid if the pfpa flag is not set.
uint32 reserved : 30; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< DOPP reference flags.
IGpuMemory* pGpuMemory; ///< The DOPP GPU memory allocation referenced by the submission.
};
/// Specifies the type of handle to use when exporting GPU memory. The non-default types are only declared in
/// PAL_AMDGPU_BUILD builds.
enum class ExportHandleType : uint32
{
Default = 0, ///< Let PAL choose the export type.
#if PAL_AMDGPU_BUILD
FileDescriptor, ///< Export using a Linux file descriptor.
Kms, ///< Export through KMS.
#endif
};
/// Specifies parameters for exporting a GPU memory object as an OS-specific handle. Input structure to
/// IGpuMemory::ExportExternalHandle(). The NT-handle related fields are only declared in PAL_KMT_BUILD builds.
struct GpuMemoryExportInfo
{
#if PAL_KMT_BUILD
const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< Specifies the security descriptor and the inheritable
/// attribute.
const wchar_t* pNtObjectName; ///< Name for the NT handle. If the object is exported as a
/// named NT handle, the handle can then be acquired via
/// this name.
uint32 accessFlags; ///< Desired access rights of GPU memory.
#endif
ExportHandleType exportType; ///< Type of handle to use for exporting the memory.
};
/**
***********************************************************************************************************************
* @interface IGpuMemory
* @brief Interface representing a GPU-accessible memory allocation.
*
* Depending on creation parameters, this could correspond to:
*
* + A _real_ memory object, corresponding directly to a physical allocation made on this device (whether it resides in
* a local or non-local heap).
* + A _virtual_ memory object, only consisting of virtual address space that can be mapped on a page basis to pages in
* _real_ memory objects via IQueue::RemapVirtualMemoryPages.
* + Pinned memory, a _real_ memory object created by pinning down client system memory.
* + Peer memory, a _real_ memory object corresponding to GPU memory that is likely local to another GPU. Only copy
* operations (peer-to-peer transfers) are allowed with this memory.
* + Opened/shared memory, a _real_ memory object that is fully shared between multiple GPUs, residing in a non-local
* heap.
* + External shared memory, a _real_ memory object that was created by an external process and is fully shared between
* multiple GPUs.
*
* @see IDevice::CreateGpuMemory
* @see IDevice::CreatePinnedGpuMemory
* @see IDevice::OpenSharedGpuMemory
* @see IDevice::OpenPeerGpuMemory
* @see IDevice::OpenExternalSharedGpuMemory
*
*
* All of these kinds of GPU memory are assigned a set of fundamental properties specified in GpuMemoryDesc which are
* either specified by the client or by PAL. There are specific rules these properties must follow; those rules are
* documented here to avoid duplication. Violating these rules will cause the device's corresponding "get size"
* functions to return an error code; the create/open functions may not validate their arguments.
*
*
* With the exception of external memory objects being opened, PAL will adjust size and base alignments as necessary
* to meet device requirements. Typically this means going out to OS page boundaries. The client is no longer required
* to query device requirements and align for PAL.
*
* Note that the device alignment requirements apply equally to GPU VAs. However, other kinds of alignment
* restrictions (e.g., IGpuMemoryBindable's requirements) may only apply to one of those two properties. When creating
* GPU memory objects the client must be careful to set the "alignment" field to the alignment of the GPU VA.
*
*
* Second, the client can't directly specify a memory object's GPU VA but must specify its VA range, limiting which
* portions of the VA space can be used. Note that non-external shared and peer GPU memory objects will use the
* original memory's VA range. External shared GPU memory always uses the default VA range.
*
* The ShadowDescriptorTable VA range is special because it pairs the shadow GPU memory to an existing descriptor GPU
* memory. The client must specify the GPU VA of the corresponding DescriptorTable memory when creating a shadow GPU
* memory object via descrVirtAddr; it must satisfy the alignment requirements of the shadow GPU memory. Both GPU
* memory objects must be created on the same device. Note that descrVirtAddr can be offset into the descriptor
* allocation such that multiple shadow GPU memory objects correspond to one larger descriptor GPU memory object.
*
*
* The client can further influence the GPU VA of shared and peer GPU memory objects. If the globalGpuVa flag is set
* when the original GPU memory object is created, PAL will assign any shared or peer GPU memory objects that same VA.
* Note that globalGpuVa is only supported if globalGpuVaSupport is set in DeviceProperties.
***********************************************************************************************************************
*/
class IGpuMemory : public IDestroyable
{
public:
/// Sets a new priority for this GPU memory object.
///
/// This call is not available for virtual or pinned memory.
///
/// @param [in] priority New base priority for the GPU memory object.
///
/// @param [in] priorityOffset New priority offset for the GPU memory object. This is a small bias that can be
/// used by the OS to raise the importance of an allocation when there are
/// multiple allocations in the same base priority level. You can think of it as
/// the fractional bits of the priority level.
///
/// @returns Success if the priority was successfully updated. Otherwise, one of the following errors may be
/// returned:
/// + ErrorUnavailable if this is a virtual or pinned GPU memory object.
virtual Result SetPriority(
GpuMemPriority priority,
GpuMemPriorityOffset priorityOffset) = 0;
/// Makes the GPU memory available for CPU access and gives the client a pointer to reference it.
///
/// The allocation should be unmapped by the client once CPU access is complete, although it _is_ legal to keep an
/// allocation mapped while the GPU references the allocation from a command buffer.
///
/// It is legal to map the allocation multiple times concurrently. Mapping is not available for pinned or virtual
/// memory objects. This call is thread safe for calls referencing this memory object.
///
/// @see Unmap
///
/// @param [out] ppData CPU pointer to the GPU memory object.
///
/// @returns Success if the map succeeded. Otherwise, *ppData will not be valid and one of the following errors may
/// be returned:
/// + ErrorInvalidPointer if ppData is null.
/// + ErrorGpuMemoryMapFailed if the object is busy and cannot be mapped by the OS.
/// + ErrorNotMappable if the memory object cannot be mapped due to some of its heaps not having the CPU
/// visible flag set.
/// + ErrorUnavailable if the memory object is not a real allocation.
virtual Result Map(
void** ppData) = 0;
/// Removes CPU access from a previously mapped GPU memory object.
///
/// This call is thread safe for calls referencing the same memory object.
///
/// @see Map
///
/// @returns Success if the unmap succeeded. Otherwise, one of the following errors may be returned:
/// + ErrorGpuMemoryUnmapFailed if the GPU memory object cannot be unlocked.
/// + ErrorUnavailable if the GPU memory object is not a real allocation.
virtual Result Unmap() = 0;
#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD
/// Returns an OS-specific handle which can be used to refer to this GPU memory object across processes. This will
/// return a null or invalid handle if the object was not created with the @ref interprocess create flag set.
///
/// @note This function is only available for Linux builds or KMT builds.
///
/// @param [in] exportInfo Parameters describing how the handle should be exported.
///
/// @returns An OS-specific handle which can be used to access the GPU memory object across processes.
virtual OsExternalHandle ExportExternalHandle(const GpuMemoryExportInfo& exportInfo) const = 0;
#endif
/// Returns a structure containing some fundamental information that describes this GPU memory object.
/// This is a non-virtual inline accessor that returns the m_desc member directly.
///
/// @returns A reference to this allocation's GpuMemoryDesc.
const GpuMemoryDesc& Desc() const { return m_desc; }
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data.
void* GetClientData() const
{
return m_pClientData;
}
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
/// Sets the SDI remote surface bus address and marker bus address.
///
/// This GPU memory object must have been created with the sdiExternal flag set and with the GpuMemoryCreateInfo
/// surfaceBusAddr and markerBusAddr fields both set to zero. This function allows clients to defer setting those
/// addresses until after creation. It must be called exactly once to permanently bind the given SDI addresses to
/// this GPU memory object.
///
/// @warning An sdiExternal GPU memory object is not complete until it's given its SDI addresses! The gpuVirtAddr
/// field in this GPU memory's GpuMemoryDesc will not be valid until this function is called!
///
/// @param [in] surfaceBusAddr Surface bus address of Bus Addressable Memory.
/// @param [in] markerBusAddr Marker bus address of Bus Addressable Memory. The client can write to the marker
/// and have the GPU wait until a value is written to the marker before continuing.
///
/// @returns Success if succeeded. Otherwise, one of the following errors may be returned:
/// + ErrorUnavailable if the GPU memory object is not external physical memory or it has already been set.
/// + ErrorInvalidValue if one of the input params is 0.
/// + An error code from a failed escape call.
virtual Result SetSdiRemoteBusAddress(gpusize surfaceBusAddr, gpusize markerBusAddr) = 0;
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method. Only the client data pointer is initialized here (to null).
IGpuMemory() : m_pClientData(nullptr) {}
/// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
/// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
/// object on their own.
virtual ~IGpuMemory() { }
GpuMemoryDesc m_desc; ///< Information that describes this GPU memory object.
///  NOTE(review): not initialized by the IGpuMemory constructor; presumably derived
///  classes fill it in before Desc() is used - confirm.
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
};
} // Pal
@@ -0,0 +1,139 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palGpuMemoryBindable.h
* @brief Defines the Platform Abstraction Library (PAL) IGpuMemoryBindable interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
namespace Pal
{
// Forward declarations.
class IGpuMemory;
/// Reports the properties required of a GPU memory object that is to be bound to a specific object. The client must
/// query these properties via IGpuMemoryBindable::GetGpuMemoryRequirements() and bind an @ref IGpuMemory object
/// matching these requirements to the @ref IGpuMemoryBindable object using IGpuMemoryBindable::BindGpuMemory().
struct GpuMemoryRequirements
{
union
{
struct
{
uint32 cpuAccess : 1; ///< CPU access is required. If set, the client must not set cpuInvisible in
/// GpuMemoryCreateFlags and must provide CPU visible heaps or a CPU visible heap
/// access mode. If not set, it's strongly recommended to set cpuInvisible.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Flags specifying required GPU memory properties.
gpusize size; ///< Amount of GPU memory required, in bytes.
gpusize alignment; ///< Required GPU memory virtual address alignment, in bytes.
uint32 heapCount; ///< Number of valid entries in heaps[].
GpuHeap heaps[GpuHeapCount]; ///< List of allowed heaps for the GPU memory in order of predicted performance.
};
/**
***********************************************************************************************************************
* @interface IGpuMemoryBindable
* @brief Interface inherited by objects that may require GPU memory be bound to them.
*
* In the future, PAL may discover a need to allocate GPU memory for a class that currently doesn't require it. In that
* situation, that class will be updated to inherit from IGpuMemoryBindable. This change would break backward
* compatibility and would result in the major interface version being incremented.
***********************************************************************************************************************
*/
class IGpuMemoryBindable : public IDestroyable
{
public:
/// Queries the GPU memory properties required by this object. The client should query properties with this method,
/// create/sub-allocate a memory range matching the requirements, then bind the memory to the object via
/// @ref BindGpuMemory().
///
/// @note Not all objects may actually need GPU memory, and in that case the memory properties will reflect a 0 size
/// and alignment.
///
/// @param [out] pGpuMemReqs Required properties of GPU memory to be bound to this object. Includes properties like
/// size, alignment, and allowed heaps.
virtual void GetGpuMemoryRequirements(
GpuMemoryRequirements* pGpuMemReqs) const = 0;
/// Binds GPU memory to this object according to the requirements queried via GetGpuMemoryRequirements().
///
/// Binding memory to objects other than images automatically initializes the object memory as necessary.  Image
/// objects used as color or depth-stencil targets have to be explicitly initialized in command buffers using an
/// ICmdBuffer::CmdReleaseThenAcquire() command to transition them out of the LayoutUninitializedTarget usage.
///
/// Binding memory to an object automatically unbinds any previously bound memory.  There is no need to bind null to
/// an object to explicitly unbind a previously bound allocation before binding a new allocation.
///
/// This call is invalid on objects that have no memory requirements, even if binding null.
///
/// @param [in] pGpuMemory  GPU memory to be bound.  If null, the previous binding will be released.
/// @param [in] offset      Offset into the GPU memory where the object's memory range should begin.  This allows
///                         sub-allocating the GPU memory of many objects from the same IGpuMemory object.
///
/// @returns Success if the specified GPU memory was successfully bound to the object.  Otherwise, one of the
///          following errors may be returned:
///          + ErrorUnavailable if binding a non-image to a virtual allocation.
///          + ErrorInvalidAlignment if the offset does not match the alignment requirements of the object.
///          + ErrorInvalidMemorySize if the object's required memory size does not fit completely within the given
///            memory object at the specified offset.
virtual Result BindGpuMemory(
IGpuMemory* pGpuMemory,
gpusize offset) = 0;
/// Returns the GPU memory object and offset that this object is bound to, or nullptr and 0 if it is not bound.
///
/// @param [out] ppGpuMemory  Address that receives the GPU memory object this object is bound to.
///                           Receives nullptr if this object is not bound to any GPU memory.
/// @param [out] pOffset      Address that receives the offset into that GPU memory.
///                           Receives 0 if this object is not bound to any GPU memory.
///
/// @returns Success if the GPU memory and offset were successfully returned.  Otherwise, one of the following errors
///          may be returned:
///          + ErrorGpuMemoryNotBound if this object is not bound to any GPU memory.
///          + ErrorInvalidPointer if either ppGpuMemory or pOffset is nullptr.
///          + ErrorUnavailable if binding is not supported in the derived class.
virtual Result GetGpuMemory(
IGpuMemory** ppGpuMemory,
gpusize* pOffset) const = 0;
protected:
/// @internal Destructor.  Declared protected to prevent use of the delete operator on this interface.  The client
/// must destroy objects by explicitly calling IDestroyable::Destroy() and is responsible for freeing the system
/// memory allocated for the object on its own.
virtual ~IGpuMemoryBindable() { }
};
} // Pal
@@ -0,0 +1,983 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palImage.h
* @brief Defines the Platform Abstraction Library (PAL) IImage interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palGpuMemoryBindable.h"
namespace Pal
{
// Forward declarations.
class IImage;
class IPrivateScreen;
class IScreen;
class ISwapChain;
// Opaque enum declarations: only the underlying type (uint32) is fixed here.
enum class CompressionMode : uint32;
enum class ClientCompressionMode : uint32;
// Unscoped enum; its enumerators are defined later in this header.
enum SwizzleMode : uint32;
struct ImageCopyRegion;
/// Sentinel value for the viewFormatCount member of the image creation structures.  When viewFormatCount is set to
/// this value (with pViewFormats = nullptr), views of the created image can use any compatible format.
constexpr uint32 AllCompatibleFormats = UINT32_MAX;
/// Dimensionality of an image: 1D, 2D, or 3D.
enum class ImageType : uint32
{
    Tex1d = 0,  ///< 1D image.
    Tex2d = 1,  ///< 2D image.
    Tex3d = 2,  ///< 3D image.
    Count       ///< Number of image types listed above.
};
/// Tiling (address swizzling) used by an image.  The contents of a linear tiled image are laid out in row-major order
/// when the image is mapped.  Every other tiling mode requires the use of swizzle equations to locate texels.
enum class ImageTiling : uint32
{
    Linear       = 0,  ///< Scan-line (row-major) layout.
    Optimal      = 1,  ///< GPU-optimal layout.
    Standard64Kb = 2,  ///< Cross-IHV, 64KB, standard swizzle tiling.
    Count              ///< Number of tiling modes listed above.
};
/// Hint telling PAL how the client would prefer this image to be organized.  This is only a preference; PAL may
/// ignore it if it believes better options exist.
enum class ImageTilingPattern : uint32
{
    Default     = 0,  ///< No swizzle mode is preferred.
    Standard    = 1,  ///< Standard swizzle modes are preferred.
    XMajor      = 2,  ///< X-coordinate major swizzle modes are preferred.
    YMajor      = 3,  ///< Y-coordinate major swizzle modes are preferred.
    Interleaved = 4,  ///< Interleaved coordinate swizzle modes are preferred.
    Count             ///< Number of tiling patterns listed above.
};
/// Hint telling PAL which optimization target to use when selecting a tiling mode.
enum class TilingOptMode : uint32
{
    Balanced    = 0,  ///< Balance memory footprint against rendering performance.
    OptForSpace = 1,  ///< Select the tiling mode that saves memory footprint.
    OptForSpeed = 2,  ///< Select the tiling mode that favors rendering performance.
    Count             ///< Number of optimization modes listed above.
};
/// Image metadata modes.
enum class MetadataMode : uint16
{
    /// Default behavior.  PAL chooses whether metadata should be present or not.
    Default      = 0,

    /// Optimization hint: the client would prefer metadata if possible.  Useful for scenarios where metadata isn't an
    /// obvious win and clients can enable it based on some heuristic or app-detect.
    ForceEnabled = 1,

    /// The image will not contain any compression metadata.
    Disabled     = 2,

    /// The color MSAA image will only contain Cmask/Fmask metadata; this mode is only valid for color MSAA images.
    /// On GPUs with GFX12-style distributed compression (see the supportDistributedCompression flag in
    /// @ref DeviceProperties), metadataMode only controls UMD metadata (Hi-Z and Hi-S).  On such GPUs, FmaskOnly is
    /// ignored and treated like Default.
    FmaskOnly    = 3,

    /// Number of metadata modes listed above.
    Count,
};
/// Image metadata TC compat modes.
enum class MetadataTcCompatMode : uint16
{
    /// Default behavior.  PAL chooses whether TC compat should be enabled (if compressed).
    Default      = 0,

    /// Optimization hint: the client would prefer that metadata is TC compat.
    ForceEnabled = 1,

    /// Optimization hint: the client would prefer that metadata is not TC compat.
    Disabled     = 2,

    /// Number of TC compat modes listed above.
    Count,
};
/// Support level for the metadata of a shared image.
enum class MetadataSharingLevel : uint32
{
FullExpand = 0, ///< The metadata needs to be fully expanded at ownership transition time.
ReadOnly = 1, ///< The metadata is expected to have read-only usage after the ownership is transitioned.
FullOptimal = 2, ///< The metadata can remain as-is, if possible, at ownership transition time.
};
/// Specifies the type of PRT map image being created.
enum class PrtMapType : uint32
{
None = 0, ///< This is not an auxiliary image used for PRT plus functionality.
Residency = 1, ///< Image data is really a low-resolution map containing the finest populated LOD
/// for a particular UV space region.
SamplingStatus = 2, ///< Indicates the validity of a given tile on a per-mip level basis.
Count, ///< Number of PRT map types listed above.
};
/// Specifies how to interpret a clear color.
enum class ClearColorType : uint32
{
Uint = 0, ///< The color is stored as an unsigned integer in RGBA order in u32Color. It will be swizzled and
/// compacted before it is written to memory.
Sint = 1, ///< The color is stored as a signed integer in RGBA order in i32Color. It will be swizzled and
/// compacted before it is written to memory.
/// NOTE(review): the ClearColor union in this header declares only u32Color and f32Color; confirm
/// which member is expected to carry signed values.
Float = 2, ///< The color is stored as floating point in RGBA order. It will be swizzled and converted to the
/// appropriate numeric format before it is written to memory.
Yuv = 3, ///< The color is stored as an unsigned integer in YUVA order in u32Color. It will be swizzled and
/// compacted before it is written to memory. The client must clamp the clear color within the
/// valid range, e.g. [0, 255] for 8-bit.
Count ///< Number of clear color types listed above.
};
/// Contains everything necessary to store and interpret a clear color.
struct ClearColor
{
ClearColorType type; ///< How to interpret this clear color.
uint8 disabledChannelMask; ///< These 4 bits are used to selectively disable the A,B,G,R channels
/// from being written. 0 means write ABGR. 0xF means write nothing.
/// 0x8 means write Blue, Green, Red. 0x7 means write Alpha. etc...
// Both members share the same storage; which one is meaningful is determined by 'type'.
union
{
uint32 u32Color[4]; ///< The clear color, interpreted as four unsigned integers.
float f32Color[4]; ///< The clear color, interpreted as four floating point values.
};
};
/// Specifies a set of image creation flags. The individual bits overlay u32All, so the whole set can be read,
/// written or compared as a single 32-bit value.
union ImageCreateFlags
{
struct
{
uint32 invariant : 1; ///< Images with this flag set and all other creation identical are
/// guaranteed to have a consistent data layout.
uint32 cloneable : 1; ///< Image is valid as a source or destination of a clone operation.
/// See @ref IDevice::ImagePrefersCloneCopy() for more details.
uint32 shareable : 1; ///< Image can be shared between compatible devices.
uint32 presentable : 1; ///< Indicates this image can be used in presents.
uint32 flippable : 1; ///< Image can be used for flip presents.
uint32 stereo : 1; ///< Indicates AMD quad buffer stereo extension (AQBS extension) image.
uint32 dxgiStereo : 1; ///< Indicates DXGI stereo (Win8 stereo) image.
uint32 cubemap : 1; ///< Image will be used as a cubemap.
uint32 prt : 1; ///< Image is a partially resident texture (aka, sparse image or tiled
/// resource).
uint32 needSwizzleEqs : 1; ///< Image requires valid swizzle equations.
uint32 perSubresInit : 1; ///< The image may have its subresources initialized independently using
/// barrier calls out of the uninitialized layout.
uint32 separateDepthPlaneInit : 1; ///< If set, the caller may transition the stencil and depth planes from
/// "Uninitialized" state at any time. Otherwise, both planes must be
/// transitioned in the same barrier call. Only meaningful if
/// "perSubresInit" is set.
uint32 repetitiveResolve : 1; ///< Optimization: Is this image resolved multiple times to an image which
/// is mostly similar to this image?
uint32 preferSwizzleEqs : 1; ///< Image prefers valid swizzle equations, but an invalid swizzle
/// equation is also acceptable.
uint32 fixedTileSwizzle : 1; ///< Fix this image's tile swizzle to ImageCreateInfo::tileSwizzle. This
/// is only supported for single-sampled color images.
uint32 videoReferenceOnly : 1; ///< Image is used by video hardware for reference buffer only.
/// It uses a different tiling format than the decoder output buffer.
uint32 optimalShareable : 1; ///< Indicates metadata information is to be added into private data on
/// creation time and honored on open time.
uint32 sampleLocsAlwaysKnown : 1; ///< Sample pattern is always known in client driver for MSAA depth image.
uint32 fullResolveDstOnly : 1; ///< Indicates any ICmdBuffer::CmdResolveImage using this image as a
/// destination will overwrite the entire image (width and height of
/// resolve region is same as width and height of resolve dst).
uint32 fullCopyDstOnly : 1; ///< Indicates any copy to this image will overwrite the entire image.
/// A perf optimization of using post-copy metadata fixup to replace heavy
/// expand at barrier to LayoutCopyDst. Unsafe to enable it if there is
/// potential partial copy to the image.
uint32 pipSwapChain : 1; ///< Indicates this image is PIP swap-chain. It is only supported on
/// Windows platforms.
uint32 view3dAs2dArray : 1; ///< If set client can view 3D image as 2D with its depth as array slices.
/// Note that not all 3D images support it. The image creation will
/// return error if we fail to create a compatible image.
uint32 tmzProtected : 1; ///< Indicates whether this image is protected or not.
uint32 sharedWithMesa : 1; ///< Indicates this Image was opened from a Mesa shared Image.
uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes.
uint32 hasModifier : 1; ///< Set if the image uses drm format modifier.
uint32 disableDccStateTracking : 1; ///< Disable a PAL optimization which is commonly broken by app bugs.
/// Setting this flag may increase DCC decompress overhead.
// Exactly one of the two declarations below is compiled, so this bit is occupied in either configuration.
#if PAL_CLIENT_EXAMPLE
uint32 useFixedSwizzleMode : 1; ///< If set, require the fixed swizzle mode provided.
/// Fails creation on incompatible swizzles.
#else
uint32 reservedSwMode : 1; ///< Reserved for future use.
#endif
uint32 reserved : 4; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
};
/// Specifies a set of ways an image might be used by the GPU (color target, shader read, etc.). The individual bits
/// overlay u32All, so the whole set can be read, written or compared as a single 32-bit value.
union ImageUsageFlags
{
struct
{
uint32 shaderRead : 1; ///< Image will be read from shader (i.e., texture).
uint32 shaderWrite : 1; ///< Image will be written from a shader (i.e., UAV).
uint32 resolveSrc : 1; ///< Image will be used as resolve source image.
uint32 resolveDst : 1; ///< Image will be used as resolve dst image.
uint32 colorTarget : 1; ///< Image will be bound as a color target.
uint32 depthStencil : 1; ///< Image will be bound as a depth/stencil target.
uint32 noStencilShaderRead : 1; ///< Image will be neither read as stencil nor resolved on stencil plane.
/// Note that if resolveSrc bit has been set to indicate that the image
/// could be adopted as a resolveSrc image and there could be stencil
/// resolve, noStencilShaderRead must be set to 0, since shader-read
/// based stencil resolve might be performed.
uint32 hiZNeverInvalid : 1; ///< Hint to PAL indicating the client will guarantee that no operations
/// performed on this Image while it is in a decompressed state will cause
/// Hi-Z metadata to become invalid. This allows PAL to avoid an expensive
/// resummarization blit in some resource barriers.
uint32 depthAsZ24 : 1; ///< Use a 24-bit format for HW programming of a native 32-bit surface.
/// If set, border color and Z-reference values are treated as Z-24.
uint32 firstShaderWritableMip : 4; ///< Only relevant if the shaderWrite flag is set. Typically set to 0 so
/// entire image is writable. If non-zero, such as an image where only
/// level 0 is used as a color target and compute is used to generate
/// mipmaps, PAL may be able to enable additional compression on the
/// base levels which are used exclusively as color target and shader read.
uint32 cornerSampling : 1; ///< Set if this image will use corner sampling in image-read scenarios.
/// With corner sampling, the extent refers to the number of pixel corners
/// which will be one more than the number of pixels. Border color is
/// ignored when corner sampling is enabled.
uint32 vrsDepth : 1; ///< Set if this depth image will be bound when VRS rendering is enabled.
uint32 disableOptimizedDisplay: 1; ///< Do not create Display Dcc.
uint32 useLossy : 1; ///< Set if this image may use lossy compression.
uint32 stencilOnlyTarget : 1; ///< This must be set if a stencil-only IDepthStencilView will be created
/// for this image.
uint32 vrsRateImage : 1; ///< This image is potentially used with CmdBindSampleRateImage.
uint32 videoDecoder : 1; ///< Indicates this Image is a video decoder target.
uint32 reserved : 12; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
};
/// Specifies properties for @ref IImage creation. Input structure to IDevice::CreateImage().
///
/// Note that by default PAL may instruct the hardware to swizzle the contents of an image in memory; if this occurs
/// two images created with identical properties will not map their texels to the same offsets in GPU memory and may
/// even have different sizes. At the expense of performance this behavior can be limited by setting the invariant flag,
/// which guarantees that images with identical properties will have identical GPU memory layouts.
///
/// For single-sampled color images, there is a middle ground between these two modes. If the fixedTileSwizzle flag is
/// set, PAL will use the tileSwizzle property instead of generating its own swizzle value. The tileSwizzle value must
/// be obtained from the base subresource of a single-sampled color image with identical properties (excluding
/// fixedTileSwizzle and tileSwizzle). This allows the client to force certain similar images to share the same GPU
/// memory layouts without forcing all similar images to a single GPU memory layout.
struct ImageCreateInfo
{
ImageCreateFlags flags; ///< Image creation flags.
ImageUsageFlags usageFlags; ///< Image usage flags.
ImageType imageType; ///< Dimensionality of image (1D/2D/3D).
SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle.
Extent3d extent; ///< Dimensions in pixels WxHxD.
uint32 mipLevels; ///< Number of mipmap levels. Cannot be 0.
uint32 arraySize; ///< Number of slices. Set to 1 for non-array images.
uint32 samples; ///< Number of coverage samples. Set to 1 for single sample images. Must be
/// greater than or equal to the number of fragments.
uint32 fragments; ///< Number of color/depth fragments. Set to 1 for single sample images.
ImageTiling tiling; ///< Controls layout of pixels in the image.
ImageTilingPattern tilingPreference; ///< Controls preferred tile swizzle organization for this image.
TilingOptMode tilingOptMode; ///< Hints to pal to select the appropriate tiling mode.
uint32 tileSwizzle; ///< If fixedTileSwizzle is set, use this value for the image's base swizzle.
#if PAL_CLIENT_EXAMPLE
SwizzleMode fixedSwizzleMode; ///< For directed image tests, force a particular swizzle mode.
#endif
/// Metadata behavior mode for this image. On GPUs with GFX12-style distributed compression
/// (see supportDistributedCompression flag in @ref DeviceProperties), metadataMode only controls UMD metadata
/// (Hi-Z and Hi-S). On such GPUs, the FmaskOnly enum is ignored and treated like Default.
MetadataMode metadataMode;
MetadataTcCompatMode metadataTcCompatMode; ///< TC compat mode for this image.
/// Distributed compression contains GL2/DF DCC compression and RB backend client compression which includes
/// fragment client compression (previous FMASK compression alike) on color MSAA images and Z Plane client
/// compression on depth stencil images. Only relevant if the backing memory pages enable compression, controllable
/// by client with @ref GpuMemoryCreateInfo::compression.
CompressionMode compressionMode;
// The type of clientCompressionMode depends on the client interface version; the member name is the same in both.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 876
/// Client compression is part of distributed compression (aka physical compression); it can only be enabled if
/// physical compression is enabled.
///
/// On Gfx12, controls (legacy FMask based) color fragment compression and Z plane compression.
ClientCompressionMode clientCompressionMode; ///< Controls client compression behavior for this resource.
#else
TriState clientCompressionMode; ///< Controls client compression behavior for this resource.
#endif
uint32 maxBaseAlign; ///< Maximum address alignment for this image or zero for an unbounded alignment.
float imageMemoryBudget; ///< The memoryBudget value used in SW addrlib to determine the minSizeBlk for textures.
/// It must be >= 0.0. When in [0.0, 1.0) addrlib uses legacy logic to decide minSizeBlk.
/// When == 1.0, addrlib uses minimizeAlign. When > 1.0, addrlib applies memory budget
/// algorithm. Although a value of 1.5 shows a significant texture allocation size
/// reduction in tests, the default value 0.0 (legacy behavior) is recommended if not
/// specified by the client.
struct
{
PrtMapType mapType; ///< Indicates what sort of PRT metadata is stored in this image. If this image is PRT
/// metadata, then it can only be associated with an image that is a power-of-two
/// multiple bigger (or the same size). Image properties need to include
/// "PrtFeaturePrtPlus" to create PRT map images. Format must be set to X8_Unorm for
/// residency map and sampling-status map types.
Extent3d lodRegion; ///< Useful only if mapType is not "none". Defines the region size of the parent image
/// that one pixel of this image matches with. The map image can only be paired with a
/// parent image of matching dimensions. This parameter can be left at zero.
} prtPlus;
/// The following "pitch" members must be zeroed unless the client is creating a @ref ImageTiling::Linear image and
/// wishes to directly specify the image's row and depth pitches. In that case, they must be integer multiples of
/// the alignments given by @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize.
uint32 rowPitch; ///< The image must have this row pitch for the first mip level (in bytes).
uint32 depthPitch; ///< The image must have this depth pitch for the first mip level (in bytes).
Rational refreshRate; ///< The expected refresh rate when presenting this flippable or stereo image.
/// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format).
/// If the client wishes to create image views with other formats they must fill out the following fields.
///
/// Valid combinations of these fields include:
/// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format.
/// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats.
/// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and
/// viewFormatCount is the length of that array.
///
/// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through
/// @ref GetImageCreateInfo.
uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats.
const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description.
// The drm format modifier members below only exist in builds where __unix__ is defined.
#if defined(__unix__)
uint64 modifier; ///< Drm format modifier. Ignored if flags.hasModifier unset.
uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier.
gpusize modifierMemoryPlaneOffset[3]; ///< Offset of main surface, display Dcc surface and gfx Dcc surface.
#endif
};
/// Compares two @ref ImageCreateInfo structures for equality.
///
/// All scalar members are compared member by member.  Note in particular:
///   - imageMemoryBudget is compared with an exact floating point equality test.
///   - refreshRate is compared by numerator and denominator, so equivalent but unreduced rationals compare unequal.
///   - The drm format modifier members (only present when __unix__ is defined) are compared only if flags.hasModifier
///     is set, and no more plane offsets are compared than modifierMemoryPlaneOffset can hold.
///   - The contents of pViewFormats are compared only if viewFormatCount describes an explicit array, i.e. it is
///     neither 0 nor AllCompatibleFormats.  If either pointer is null in that case, the structures are considered
///     equal only if both pointers are null.
///
/// @param [in] lhs  Left-hand operand.
/// @param [in] rhs  Right-hand operand.
///
/// @returns true if all compared members match, false otherwise.
inline constexpr bool operator==(const ImageCreateInfo& lhs, const ImageCreateInfo& rhs)
{
    bool same = (lhs.flags.u32All            == rhs.flags.u32All)            &&
                (lhs.usageFlags.u32All       == rhs.usageFlags.u32All)       &&
                (lhs.imageType               == rhs.imageType)               &&
                (lhs.swizzledFormat          == rhs.swizzledFormat)          &&
                (lhs.extent                  == rhs.extent)                  &&
                (lhs.mipLevels               == rhs.mipLevels)               &&
                (lhs.arraySize               == rhs.arraySize)               &&
                (lhs.samples                 == rhs.samples)                 &&
                (lhs.fragments               == rhs.fragments)               &&
                (lhs.tiling                  == rhs.tiling)                  &&
                (lhs.tilingPreference        == rhs.tilingPreference)        &&
                (lhs.tilingOptMode           == rhs.tilingOptMode)           &&
                (lhs.tileSwizzle             == rhs.tileSwizzle)             &&
#if PAL_CLIENT_EXAMPLE
                (lhs.fixedSwizzleMode        == rhs.fixedSwizzleMode)        &&
#endif
                (lhs.metadataMode            == rhs.metadataMode)            &&
                (lhs.metadataTcCompatMode    == rhs.metadataTcCompatMode)    &&
                (lhs.compressionMode         == rhs.compressionMode)         &&
                (lhs.clientCompressionMode   == rhs.clientCompressionMode)   &&
                (lhs.maxBaseAlign            == rhs.maxBaseAlign)            &&
                (lhs.imageMemoryBudget       == rhs.imageMemoryBudget)       &&
                (lhs.prtPlus.mapType         == rhs.prtPlus.mapType)         &&
                (lhs.prtPlus.lodRegion       == rhs.prtPlus.lodRegion)       &&
                (lhs.rowPitch                == rhs.rowPitch)                &&
                (lhs.depthPitch              == rhs.depthPitch)              &&
                (lhs.refreshRate.numerator   == rhs.refreshRate.numerator)   &&
                (lhs.refreshRate.denominator == rhs.refreshRate.denominator) &&
                (lhs.viewFormatCount         == rhs.viewFormatCount);

#if defined(__unix__)
    // flags.u32All already matched above, so checking hasModifier on one side is sufficient.
    if (same && (lhs.flags.hasModifier != 0))
    {
        // Never read past the end of modifierMemoryPlaneOffset, even if modifierPlaneCount is out of range.
        constexpr uint32 MaxPlanes =
            static_cast<uint32>(sizeof(ImageCreateInfo::modifierMemoryPlaneOffset) / sizeof(gpusize));
        const uint32 planeCount = (lhs.modifierPlaneCount < MaxPlanes) ? lhs.modifierPlaneCount : MaxPlanes;

        same = (lhs.modifier           == rhs.modifier)           &&
               (lhs.modifierPlaneCount == rhs.modifierPlaneCount) &&
               (memcmp(&lhs.modifierMemoryPlaneOffset[0],
                       &rhs.modifierMemoryPlaneOffset[0],
                       sizeof(gpusize) * planeCount) == 0);
    }
#endif

    // viewFormatCount already matched above, so lhs.viewFormatCount is the array length for both sides.
    if (same && (lhs.viewFormatCount > 0) && (lhs.viewFormatCount != AllCompatibleFormats))
    {
        if ((lhs.pViewFormats == nullptr) || (rhs.pViewFormats == nullptr))
        {
            // Passing a null pointer to memcmp is undefined behavior; without an array on one side the infos can
            // only match if the array is missing on both sides.
            same = (lhs.pViewFormats == rhs.pViewFormats);
        }
        else
        {
            same = (memcmp(lhs.pViewFormats, rhs.pViewFormats, lhs.viewFormatCount * sizeof(SwizzledFormat)) == 0);
        }
    }

    return same;
}
/// Specifies properties for presentable @ref IImage creation. Input structure to IDevice::CreatePresentableImage().
struct PresentableImageCreateInfo
{
union
{
struct
{
uint32 fullscreen : 1; ///< Image supports fullscreen presentation.
uint32 stereo : 1; ///< Image supports stereoscopic rendering and display.
/// Implies an array size of 2. Fullscreen must be set.
uint32 turbosync : 1; ///< Image supports turbosync flip.
uint32 peerWritable : 1; ///< Indicates if the memory allocated will be writable by other devices.
uint32 tmzProtected : 1; ///< Indicates this presentable image's memory is tmz Protected.
// Exactly one of the two declarations below is compiled, so this bit is occupied in either configuration.
#if PAL_AMDGPU_BUILD
uint32 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes
/// the allocation upon creation. Currently, only GpuHeapLocal and
/// GpuHeapInvisible are supported.
#else
uint32 placeholder0 : 1; ///< Placeholder.
#endif
uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes.
uint32 reserved : 25; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Presentable image creation flags.
SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle.
ImageUsageFlags usage; ///< Image usage flags.
Extent2d extent; ///< Width/height of the image.
CompressionMode compressionMode; ///< Specify GFX12-style distributed compression behavior for this resource.
/// Only relevant if the backing memory pages enable compression (controllable by
/// client with the distributedCompression field in @ref GpuMemoryCreateInfo).
const IScreen* pScreen; ///< Target screen for fullscreen presentable images. Can be null if the fullscreen
/// flag is 0.
OsDisplayHandle hDisplay; ///< Display handle of the local display system only for WSI.
OsWindowHandle hWindow; ///< Window handle only for WSI.
ISwapChain* pSwapChain; ///< SwapChain object which the presentable image belongs to.
/// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format).
/// If the client wishes to create image views with other formats they must fill out the following fields.
///
/// Valid combinations of these fields include:
/// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format.
/// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats.
/// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and
/// viewFormatCount is the length of that array.
///
/// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through
/// @ref GetImageCreateInfo.
uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats.
const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description.
};
/// Specifies properties for private screen @ref IImage creation. Input structure to
/// IDevice::CreatePrivateScreenImage().
struct PrivateScreenImageCreateInfo
{
union
{
struct
{
uint32 invariant : 1; ///< Images with this flag set and all other creation identical are guaranteed
/// to have a consistent data layout.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Private screen image creation flags.
SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle.
ImageUsageFlags usage; ///< Image usage flags.
Extent2d extent; ///< Width/height of the image.
IPrivateScreen* pScreen; ///< Private screen this image is created on (this image can then be presented on
/// this private screen).
CompressionMode compressionMode;///< Specify GFX12-style distributed compression behavior for this resource.
/// Only relevant if the backing memory pages enable compression (controllable by
/// client with the distributedCompression field in @ref GpuMemoryCreateInfo).
/// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format).
/// If the client wishes to create image views with other formats they must fill out the following fields.
///
/// Valid combinations of these fields include:
/// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format.
/// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats.
/// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and
/// viewFormatCount is the length of that array.
///
/// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through
/// @ref GetImageCreateInfo.
uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats.
const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description.
};
/// Specifies parameters for opening another device's image for peer access from this device. Input structure to
/// IDevice::OpenPeerImage().
struct PeerImageOpenInfo
{
const IImage* pOriginalImage; ///< The other device's image that is to be opened for peer access.
};
/// Specifies parameters for opening another non-PAL device's image for access from this device. Input structure to
/// IDevice::OpenExternalSharedImage().
struct ExternalImageOpenInfo
{
ExternalResourceOpenInfo resourceInfo; ///< Information describing the external image.
Extent3d extent; ///< Expected extent for the external image. If any dimension of this
/// reference extent is zero, the reference value is ignored and the
/// extent from the shared metadata is used instead.
SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. Or UndefinedFormat to infer the
/// format internally.
ImageCreateFlags flags; ///< Image Creation flags.
ImageUsageFlags usage; ///< Image usage flags.
IPrivateScreen* pScreen; ///< Private screen this image is created on, or null.
gpusize gpuMemOffset; ///< GpuMemory offset.
// The members below only exist in builds where __unix__ is defined.
#if defined(__unix__)
gpusize dccOffset; ///< Offset of gfx Dcc surface if nonzero.
gpusize displayDccOffset; ///< Offset of display Dcc surface if nonzero.
uint64 modifier; ///< Drm format modifier, if flags.hasModifier is set.
uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier.
#endif
/// The following members must be set to zero unless the client is opening a @ref ImageTiling::Linear image with
/// specified row and depth pitches. In that case, they must be integer multiples of the alignments given by
/// @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize.
gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines
/// of the subresource.
gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive
/// slices.
};
/// Reports the overall GPU memory layout of the entire image. Output structure for IImage::GetMemoryLayout(). Unused
/// sections will have a size of zero, an offset of zero, and an alignment of one. The layout is split into:
/// + Image Data: The raw texel values for all subresources of the image.
/// + Image Metadata: Additional data that will be used to optimize GPU operations that access the image.
/// + Image Metadata Header: A special subsection of the metadata for small bits of data with weaker alignment.
struct ImageMemoryLayout
{
gpusize dataSize; ///< The size, in bytes, of the image's core data section.
gpusize dataAlignment; ///< The alignment, in bytes, of the image's core data section.
gpusize metadataOffset; ///< The offset, in bytes, of the image's metadata section.
gpusize metadataSize; ///< The size, in bytes, of the image's metadata section.
gpusize metadataAlignment; ///< The alignment, in bytes, of the image's metadata section.
gpusize metadataHeaderOffset; ///< The offset, in bytes, of the image's metadata header.
gpusize metadataHeaderSize; ///< The size, in bytes, of the image's metadata header.
gpusize metadataHeaderAlignment; ///< The alignment, in bytes, of the image's metadata header.
uint8 swizzleEqIndices[2]; ///< Which swizzle equations this image uses or InvalidSwizzleEqIndex if
/// there are no swizzle equations for this image's layout.
uint8 swizzleEqTransitionMip; ///< Before this mip level, the image uses swizzleEqIndices[0]; from this
/// mip level onwards, the image uses swizzleEqIndices[1].
uint8 swizzleEqTransitionPlane; ///< Before this plane, the image uses swizzleEqIndices[0]; from this
/// plane onwards, the image uses swizzleEqIndices[1].
uint32 prtTileWidth; ///< Width, in texels, of a PRT tile.
uint32 prtTileHeight; ///< Height, in texels, of a PRT tile.
uint32 prtTileDepth; ///< Depth, in texels, of a PRT tile.
uint32 prtMinPackedLod; ///< First mip level that is packed into the PRT mip tail.
uint32 prtMipTailTileCount; ///< Number of tiles in the packed mip tail. This may either indicate the
/// size per slice or per image depending on the support for
/// PrtFeaturePerLayerMipTail (@see PrtFeatureFlags).
uint32 stereoLineOffset; ///< Y offset to the right eye data, in texels.
};
/// Collection of bitmasks specifying which operations are currently allowed on an image, and which queues are allowed
/// to perform those operations. Based on this information, PAL can determine the best compression state of the image.
///
/// The two bitfields together occupy 32 bits (24 for usages, 8 for engines).
struct ImageLayout
{
uint32 usages : 24; ///< Bitmask of @ref ImageLayoutUsageFlags values.
uint32 engines : 8; ///< Bitmask of @ref ImageLayoutEngineFlags values.
};
/**
****************************************************************************************************
* @brief
* Enumerates swizzle modes usable on any supported GPU.
* @note
* For details please check _AddrSwizzleMode
*
****************************************************************************************************
*/
enum SwizzleMode : uint32
{
SwizzleModeLinear = 0,
SwizzleMode256BS,
SwizzleMode256BD,
SwizzleMode256BR,
SwizzleMode4KbZ,
SwizzleMode4KbS,
SwizzleMode4KbD,
SwizzleMode4KbR,
SwizzleMode64KbZ,
SwizzleMode64KbS,
SwizzleMode64KbD,
SwizzleMode64KbR,
SwizzleMode64KbZT,
SwizzleMode64KbST,
SwizzleMode64KbDT,
SwizzleMode64KbRT,
SwizzleMode4KbZX,
SwizzleMode4KbSX,
SwizzleMode4KbDX,
SwizzleMode4KbRX,
SwizzleMode64KbZX,
SwizzleMode64KbSX,
SwizzleMode64KbDX,
SwizzleMode64KbRX,
SwizzleMode256KbVarZX,
SwizzleMode256KbVarSX,
SwizzleMode256KbVarDX,
SwizzleMode256KbVarRX,
// The meaning of the swizzle modes that follow varies by hardware generation; do not compare them directly
// with the Z / S / D / R modes listed above.
SwizzleMode256B2D,
SwizzleMode4Kb2D,
SwizzleMode4Kb3D,
SwizzleMode64Kb2D,
SwizzleMode64Kb3D,
SwizzleMode256Kb2D,
SwizzleMode256Kb3D,
SwizzleMode64Kb2Dz,
SwizzleMode256Kb2Dz,
SwizzleModeCount,
};
/// Reports position and memory layout information for a specific subresource in an image. Output structure for
/// IImage::GetSubresourceLayout().
struct SubresLayout
{
uint32 elementBytes; ///< Size of each element in bytes.
gpusize offset; ///< Offset in bytes from the base of the image's GPU memory where the subresource starts.
gpusize swizzleOffset; ///< Offset in bytes used for supporting parameterized swizzle.
gpusize size; ///< Size of the subresource in bytes.
gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines of the subresource.
gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices.
uint32 tileToken; ///< Token representing various tiling information necessary for determining compatible
/// optimally tiled copies.
uint32 tileSwizzle; ///< Bank/Pipe swizzle bits for macro-tiling modes.
Extent3d blockSize; ///< Size of a tile block in texels - micro tile for 1D tiling and macro tile for 2D tiling.
Offset3d mipTailCoord; ///< Coordinates of the subresource within the mip tail.
Extent3d extentTexels; ///< Unpadded extent of the subresource in texels.
Extent3d extentElements; ///< Unpadded extent of the subresource in elements.
Extent3d paddedExtent; ///< Extent of the subresource in elements, including all internal padding for this subresource.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 912
/// Reports supported engines and usages for this subresource while it can remain in its optimal compression state.
/// Clients using CmdRelease()/CmdAcquire() without complete knowledge of the application's next usage during
/// CmdRelease() or its previous usage at CmdAcquire() can treat this layout as a performant target for an
/// intermediate state that will avoid unnecessary decompressions.
///
/// This value is only valid if supportSplitReleaseAcquire is set in @ref DeviceProperties.
/// This member only exists for client interface versions older than 912.
ImageLayout defaultGfxLayout;
#endif
SwizzledFormat planeFormat; ///< Swizzled format for this plane. Planar resources like D32-S8
/// will have a different swizzled format per plane.
SwizzleMode swizzleMode; ///< Swizzle mode for this plane, based on AddrSwizzleMode.
uint32 hwSwizzleMode; ///< Hardware swizzle enum; the enum type depends on the gfx version.
};
/// Selects a specific subresource of an image resource.
///
/// Most images only have a single data plane but in some cases conceptually related data will be stored in physically
/// separate locations which we call planes. If an image only has a single plane it will always be plane 0.
/// We define the following fixed mappings for all multi-plane formats.
/// + Depth-stencil: if the image format contains depth and stencil data, plane 0 is depth and plane 1 is stencil.
/// + YUV-planar: if the image format is @ref YuvPlanar it has either two or three planes. The luma plane
/// is always plane 0. If the format is @ref ChNumFormat::YV12 it has three planes where plane 1 is the
/// red-difference chrominance plane and plane 2 is the blue-difference chrominance plane. Otherwise, plane 1
/// interleaves blue-difference and red-difference chrominance values.
///
/// Two layouts exist: client interface versions 886 and newer use narrow (8/8/16-bit) fields, while older versions
/// use three 32-bit fields. SubresRange follows the same split.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 886
struct SubresId
{
uint8 plane; ///< Selects a data plane.
uint8 mipLevel; ///< Selects a mip level.
uint16 arraySlice; ///< Selects an array slice.
};
/// Defines a range of subresources.
struct SubresRange
{
SubresId startSubres; ///< First subresource in the range.
uint8 numPlanes; ///< Number of planes in the range.
uint8 numMips; ///< Number of mip levels in the range.
uint16 numSlices; ///< Number of slices in the range.
};
#else
/// Selects a specific subresource of an image resource (32-bit field layout for interface versions older than 886).
struct SubresId
{
uint32 plane; ///< Selects a data plane.
uint32 mipLevel; ///< Selects a mip level.
uint32 arraySlice; ///< Selects an array slice.
};
/// Defines a range of subresources.
struct SubresRange
{
SubresId startSubres; ///< First subresource in the range.
uint32 numPlanes; ///< Number of planes in the range.
uint32 numMips; ///< Number of mip levels in the range.
uint32 numSlices; ///< Number of slices in the range.
};
#endif
/// A variant struct of MemoryImageCopyRegion.
/// Specifies parameters for a copy from CPU memory to an image.
/// An input for IImage::CopyMemoryToImage().
struct MemoryToImageCopyRegion
{
SubresId imageSubres; ///< Selects the image subresource.
Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region.
Extent3d imageExtent; ///< Size of the image region in elements.
uint32 numSlices; ///< Number of slices the copy will span.
const void* pHostPtr; ///< Pointer to the host memory to copy from.
gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines.
gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices.
};
/// A variant struct of MemoryImageCopyRegion.
/// Specifies parameters for a copy from an image to CPU memory.
/// An input for IImage::CopyImageToMemory().
struct ImageToMemoryCopyRegion
{
SubresId imageSubres; ///< Selects the image subresource.
Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region.
Extent3d imageExtent; ///< Size of the image region in elements.
uint32 numSlices; ///< Number of slices the copy will span.
void* pHostPtr; ///< Pointer to the host memory to copy to.
gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines.
gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices.
};
/// Compares two subresource IDs for equality.
///
/// @returns True only if plane, mip level and array slice all match.
inline constexpr bool operator==(const SubresId& lhs, const SubresId& rhs)
{
    // Equal exactly when no field differs.
    return ((lhs.plane      != rhs.plane)    ||
            (lhs.mipLevel   != rhs.mipLevel) ||
            (lhs.arraySlice != rhs.arraySlice)) == false;
}
/// Compares two subresource IDs for inequality.
///
/// @returns True if any of plane, mip level or array slice differ (the negation of operator==).
inline constexpr bool operator!=(const SubresId& lhs, const SubresId& rhs)
{
    return !(lhs == rhs);
}
/// Compares two subresource ranges for equality.
///
/// @returns True only if both ranges start at the same subresource and span the same number of planes, mips and
///          slices.
inline constexpr bool operator==(const SubresRange& lhs, const SubresRange& rhs)
{
    if ((lhs.startSubres == rhs.startSubres) == false)
    {
        return false;
    }

    return ((lhs.numPlanes != rhs.numPlanes) ||
            (lhs.numMips   != rhs.numMips)   ||
            (lhs.numSlices != rhs.numSlices)) == false;
}
/// Determines if two subresource ranges are overlapped.
///
/// @returns True if two subresource ranges are overlapped, false otherwise.
inline constexpr bool OverlappedSubresRanges(
const SubresRange& a,
const SubresRange& b)
{
const SubresId aStart = a.startSubres;
const SubresId bStart = b.startSubres;
return (aStart.plane < (bStart.plane + b.numPlanes)) &&
(bStart.plane < (aStart.plane + a.numPlanes)) &&
(aStart.mipLevel < (bStart.mipLevel + b.numMips)) &&
(bStart.mipLevel < (aStart.mipLevel + a.numMips)) &&
(aStart.arraySlice < (bStart.arraySlice + b.numSlices)) &&
(bStart.arraySlice < (aStart.arraySlice + a.numSlices));
}
/**
***********************************************************************************************************************
* @interface IImage
* @brief Represents an image resource that can be accessed by the GPU.
*
* @see IDevice::CreateImage()
* @see IDevice::OpenPeerImage()
***********************************************************************************************************************
*/
class IImage : public IGpuMemoryBindable
{
public:
/// Reports information on the layout of the image in memory such as core data size and metadata alignment.
///
/// @returns A reference to the image's ImageMemoryLayout.
virtual const ImageMemoryLayout& GetMemoryLayout() const = 0;
/// Reports information on the full range of the image's subresources.
///
/// @param [out] pRange Reports info on the full range of the image's subresources such as number of mips and
/// planes.
///
/// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be
/// returned:
/// + ErrorInvalidPointer if pRange is null.
virtual Result GetFullSubresourceRange(SubresRange* pRange) const = 0;
/// Reports information on the layout of the specified subresource in memory.
///
/// @param [in] subresId Selects a subresource from the image (plane/mip/slice).
/// @param [out] pLayout Reports info on the subresource layout such as size and pitch.
///
/// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be
/// returned:
/// + ErrorInvalidPointer if pLayout is null.
/// + ErrorInvalidValue if the subresId is out of range for this image.
virtual Result GetSubresourceLayout(
SubresId subresId,
SubresLayout* pLayout) const = 0;
#if defined(__unix__)
/// Reports information on the memory plane layout of the specified subresource in memory for image with modifier.
///
/// @param [in] memoryPlane Selects a memory plane from the image.
/// @param [out] pLayout Reports info on the subresource layout such as size and pitch.
///
/// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be
/// returned:
/// + ErrorInvalidValue if the memory plane is out of range for this image.
virtual Result GetModifierSubresourceLayout(
uint32 memoryPlane,
SubresLayout* pLayout) const = 0;
#endif
/// Reports the create info of the image.
///
/// @returns A reference to the ImageCreateInfo retained by this object at construction.
const ImageCreateInfo& GetImageCreateInfo() const { return m_createInfo; }
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data.
void* GetClientData() const
{
return m_pClientData;
}
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
#if defined(_WIN32)
/// This method checks if the image is an opened cross-adapter shared image on MS hybrid graphics system.
///
/// @returns True if the image is an opened cross-adapter shared image. False otherwise.
virtual bool IsCrossAdapter() const = 0;
/// Returns a special resource ID. Can be used to associate an opened resource with its original resource.
///
/// @returns The optimal sharing ID.
virtual uint64 GetOptimalSharingId() const = 0;
#endif
/// Sets level of optimal sharing by opening APIs using this optimal sharable image and pass this information to the
/// creator. This function is supposed to be called by openers only. The call by creator is ignored.
///
/// @param [in] level Level to be set to specified client API.
virtual void SetOptimalSharingLevel(
MetadataSharingLevel level) = 0;
/// Returns support level set by all possible opening APIs.
///
/// @returns A summarized supporting level.
virtual MetadataSharingLevel GetOptimalSharingLevel() const = 0;
/// Gives the client access to the resource ID used for internal Pal events.
/// EX: Resource Create, Resource Bind, Resource Destroy.
///
/// @returns The Resource ID.
virtual const void* GetResourceId() const = 0;
/// Copies data directly from CPU memory to an Image.
///
/// @param [in] pRegions Pointer to an array of regions, each describing one copy from host memory into the image.
/// @param [in] regionCount Number of regions to copy.
/// @param [in] useMemcpy Indicates that the data could be copied more efficiently from host memory to the image
/// because the image data is already swizzled in host memory.
///
/// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the
/// following error codes may be returned:
/// + ErrorInvalidPointer if the memory object bound to image is null.
/// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS.
/// + ErrorOutOfMemory if out of system memory.
/// + ErrorInvalidValue if copy between image and memory failed.
virtual Result CopyMemoryToImage(
const MemoryToImageCopyRegion* pRegions,
const uint32 regionCount,
bool useMemcpy) const = 0;
/// Copies data directly from an Image to CPU memory.
///
/// @param [in] pRegions Pointer to an array of regions, each describing one copy from the image into host memory.
/// @param [in] regionCount Number of regions to copy.
/// @param [in] useMemcpy Indicates that the data could be copied more efficiently from the image to host memory;
/// the image data will be obtained while retaining the physical layout of the image.
///
/// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the
/// following error codes may be returned:
/// + ErrorInvalidPointer if the memory object bound to image is null.
/// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS.
/// + ErrorOutOfMemory if out of system memory.
/// + ErrorInvalidValue if copy between image and memory failed.
virtual Result CopyImageToMemory(
const ImageToMemoryCopyRegion* pRegions,
const uint32 regionCount,
bool useMemcpy) const = 0;
/// Copies data between two images with specified regions.
///
/// @param [in] pDstImage Pointer to the destination image where the data will be copied.
/// @param [in] pImgRegions Pointer to an array of regions specifying the area of image to be copied.
/// @param [in] regionCount Number of regions to copy between the source and destination images.
///
/// @returns Success if the copy operation was successfully performed. Otherwise, one of the following error codes
/// may be returned:
/// + ErrorInvalidPointer if any of the input pointers are null.
/// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS.
/// + ErrorOutOfMemory if there is insufficient memory to perform the operation.
/// + ErrorInvalidValue if copy between images failed.
virtual Result CopyBetweenImages(
IImage* pDstImage,
const ImageCopyRegion* pImgRegions,
const uint32 regionCount) const = 0;
protected:
/// @internal Constructor. Stores a copy of the create info and initializes the client data pointer to null.
///
/// @param [in] createInfo App-specified parameters describing the desired image properties.
IImage(const ImageCreateInfo& createInfo) : m_createInfo(createInfo), m_pClientData(nullptr) { }
/// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
/// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
/// object on their own.
virtual ~IImage() { }
/// Retained image create info; a copy of the structure passed to the constructor, returned by
/// GetImageCreateInfo().
const ImageCreateInfo m_createInfo;
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
};
} // Pal
@@ -0,0 +1,508 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palLib.h
* @brief Defines the Platform Abstraction Library (PAL) initialization and destruction functions.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palSysMemory.h"
#include "palDbgPrint.h"
/// Major interface version. Note that the interface version is distinct from the PAL version itself, which is returned
/// in @ref Pal::PlatformProperties.
///
/// @attention Updates to the major version indicate an interface change that is not backward compatible and may require
/// action from each client during their next integration. When determining if a change is backward
/// compatible, it is assumed that the client will default-initialize all structs.
///
/// @ingroup LibInit
#define PAL_INTERFACE_MAJOR_VERSION 942
/// Minimum major interface version. This is the minimum interface version PAL supports in order to support backward
/// compatibility. When it is equal to PAL_INTERFACE_MAJOR_VERSION, only the latest interface version is supported.
///
/// @ingroup LibInit
#define PAL_MINIMUM_INTERFACE_MAJOR_VERSION 872
/// Minimum supported major interface version for devdriver library. This is the minimum interface version of the
/// devdriver library that PAL is backwards compatible to.
///
/// @ingroup LibInit
#define PAL_MINIMUM_GPUOPEN_INTERFACE_MAJOR_VERSION 38
/**
***********************************************************************************************************************
* @def PAL_INTERFACE_VERSION
* @ingroup LibInit
* @brief Current PAL interface version packed into a 32-bit unsigned integer. The major version is shifted left by
*        16 bits, so the low 16 bits are always zero. They used to contain the interface minor version and remain
*        as a placeholder in case we add it back.
*
* @see PAL_INTERFACE_MAJOR_VERSION
*
* @hideinitializer
***********************************************************************************************************************
*/
#define PAL_INTERFACE_VERSION (PAL_INTERFACE_MAJOR_VERSION << 16)
namespace Pal
{
// Forward declarations
class IPlatform;
/// This is a list of GPUs that the NULL OS layer can compile shaders to in offline mode.
///
/// Enumerators have no explicit values after Default, so their numeric values depend on whether the pre-888 entries
/// are compiled in.
enum class NullGpuId : uint32
{
Default = 0, ///< PAL gives the client an arbitrary supported null device.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888
Polaris10, ///< 8.0.3
Polaris11, ///< 8.0.3
Polaris12, ///< 8.0.3
Vega10, ///< 9.0.0
Raven, ///< 9.0.2
Vega12, ///< 9.0.4
Vega20, ///< 9.0.6
Raven2, ///< 9.0.9
Renoir, ///< 9.0.9
#endif
Navi10, ///< 10.1.0
Navi12, ///< 10.1.1
Navi14, ///< 10.1.2
Navi21, ///< 10.3.0
Navi22, ///< 10.3.1
Navi23, ///< 10.3.2
Navi24, ///< 10.3.4
Rembrandt, ///< 10.3.5
Raphael, ///< 10.3.6
Navi31, ///< 11.0.0
Navi32, ///< 11.0.1
Navi33, ///< 11.0.2
Phoenix1, ///< 11.0.3
Phoenix2, ///< 11.0.3
Strix1, ///< 11.5.0
StrixHalo, ///< 11.5.1
Krackan1, ///< 11.5.2
Navi44, ///< 12.0.0
Navi48, ///< 12.0.1
#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION>= 888)
// NOTE(review): this conditional section is empty and contributes no enumerators; presumably a placeholder for
// entries not present in this release - confirm whether it can be removed.
#endif
Max, ///< The maximum count of null devices.
All, ///< If you want to enumerate all null devices.
};
/// Specifies which graphics IP level (GFXIP) this device has.
///
/// For client interface versions 888 and newer the enumerators are implicitly numbered starting after _None; older
/// interface versions use the explicit legacy values listed in the #else section.
enum class GfxIpLevel : uint32
{
_None = 0, ///< @internal The device does not have a GFXIP block, or its level cannot be determined
// Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either
// undefing None before including this header or using _None when dealing with PAL.
#ifndef None
None = _None, ///< The device does not have a GFXIP block, or its level cannot be determined
#endif
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888
GfxIp10_1, ///< GFXIP 10.1 (Navi1x)
GfxIp10_3, ///< GFXIP 10.3 (Navi2x, Rembrandt, Raphael, Mendocino)
GfxIp11_0, ///< GFXIP 11.0 (Navi3x, Phoenix)
GfxIp11_5, ///< GFXIP 11.5 (Strix)
GfxIp12, ///< GFXIP 12.0 (Navi4x)
#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888
// Legacy numbering with explicit values, used by interface versions older than 888.
GfxIp6 = 0x1,
GfxIp7 = 0x2,
GfxIp8 = 0x3,
GfxIp8_1 = 0x4,
GfxIp9 = 0x5,
GfxIp10_1 = 0x7,
GfxIp10_3 = 0x9,
GfxIp11_0 = 0xC,
GfxIp11_5 = 0xF,
GfxIp12 = 0x11,
#endif
};
/// Specifies the hardware revision. Some AMD tools hard-code these values so we cannot change them. New ASICs should
/// be added at the end of the list and be given the next highest value.
///
/// The trailing comments on the Navi and newer entries give the same version triple that is listed for the
/// matching entry in NullGpuId.
enum class AsicRevision : uint32
{
Unknown = 0x00,
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888
Tahiti = 0x01,
Pitcairn = 0x02,
Capeverde = 0x03,
Oland = 0x04,
Hainan = 0x05,
Bonaire = 0x06,
Hawaii = 0x07,
HawaiiPro = 0x08,
Kalindi = 0x0A,
Godavari = 0x0B,
Spectre = 0x0C,
Spooky = 0x0D,
Carrizo = 0x0E,
Bristol = 0x0F,
Stoney = 0x10,
Iceland = 0x11,
Tonga = 0x12,
TongaPro = Tonga, ///< Alias that shares Tonga's value.
Fiji = 0x13,
Polaris10 = 0x14,
Polaris11 = 0x15,
Polaris12 = 0x16,
Vega10 = 0x18,
Vega12 = 0x19,
Vega20 = 0x1A,
Raven = 0x1B,
Raven2 = 0x1C,
Renoir = 0x1D,
#endif
Navi10 = 0x1F, ///< 10.1.0
Navi12 = 0x21, ///< 10.1.1
Navi14 = 0x23, ///< 10.1.2
Navi21 = 0x24, ///< 10.3.0
Navi22 = 0x25, ///< 10.3.1
Navi23 = 0x26, ///< 10.3.2
Navi24 = 0x27, ///< 10.3.4
Navi31 = 0x2C, ///< 11.0.0
Navi32 = 0x2D, ///< 11.0.1
Navi33 = 0x2E, ///< 11.0.2
Rembrandt = 0x2F, ///< 10.3.5
Strix1 = 0x33, ///< 11.5.0
Raphael = 0x34, ///< 10.3.6
Phoenix1 = 0x35, ///< 11.0.3
Phoenix2 = 0x38, ///< 11.0.3
HawkPoint1 = 0x39, ///< 11.0.3
HawkPoint2 = 0x3A, ///< 11.0.3
Krackan1 = 0x3B, ///< 11.5.2
StrixHalo = 0x3C, ///< 11.5.1
Navi44 = 0x3D, ///< 12.0.0
Navi48 = 0x3E, ///< 12.0.1
};
/// Maps a null GPU ID to its associated text name. Element type of the array filled in by EnumerateNullDevices().
struct NullGpuInfo
{
NullGpuId nullGpuId; ///< ID of an ASIC that PAL supports for override purposes.
const char* pGpuName; ///< Text name of the ASIC specified by nullGpuId.
};
/// Various IDs and info associated with a particular GPU. Output structure of the GetNullGpuInfoFor*() lookup
/// functions.
struct GpuInfo
{
AsicRevision asicRev; ///< PAL specific ASIC revision identifier.
NullGpuId nullId; ///< PAL specific GPU ID supported by the NULL OS layer.
GfxIpLevel gfxIpLevel; ///< PAL specific identifier for the device's graphics IP level (GFXIP).
uint32 familyId; ///< Hardware family ID. Driver-defined identifier for a particular family of devices.
uint32 eRevId; ///< GPU emulation/internal revision ID.
uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different SKUs or revisions.
uint32 gfxEngineId; ///< Coarse-grain GFX engine ID (R800, SI, etc.).
uint32 deviceId; ///< PCI device ID (e.g., Hawaii XT = 0x67B0).
const char* pGpuName; ///< ASIC name and AMDGPU target name (e.g., "NAVI31:gfx1100").
};
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 915
/// PAL client APIs. Legacy definition used by interface versions older than 915; the values are explicit and
/// non-contiguous.
enum class ClientApi : uint32
{
Pal = 0,
Dx9 = 1,
Dx12 = 3,
Vulkan = 4,
OpenCl = 7,
Hip = 8,
Amf = 9,
};
#else
/// The client UMD must identify its API using this enum. Some UMD builds may implement multiple APIs so they must
/// specify which API they're implementing at runtime. Note that the PAL_CLIENT macros are the preferred way to
/// implement client-specific behavior; runtime ClientApi checks should only be used when necessary.
///
/// Unlike the legacy definition, the enumerators here are implicitly numbered, so OpenCl and Hip do not keep their
/// legacy values.
enum class ClientApi : uint32
{
OpenCl,
Hip
};
#endif
/// Specifies properties for @ref IPlatform creation. Input structure to Pal::CreatePlatform().
struct PlatformCreateInfo
{
const Util::AllocCallbacks* pAllocCb; ///< Optional client-provided callbacks. If non-null, PAL will call the
/// specified callbacks to allocate and free all internal system
/// memory. If null, PAL will manage memory on its own through the C
/// runtime library.
const Util::LogCallbackInfo* pLogInfo; ///< Optional client-provided callback info. If non-null, Pal will
/// call the callback to pass debug prints to the client.
const char* pSettingsPath; ///< A null-terminated string describing the path to where settings are
/// located on the system. For example, on Windows, this will refer to
/// which UMD subkey to look in under a device's key. For Linux, this
/// is the path to the settings file.
union
{
struct
{
uint32 disableGpuTimeout : 1; ///< Disables GPU timeout detection (Windows only)
uint32 force32BitVaSpace : 1; ///< Forces 32bit VA space for the flat address with 32bit ISA
uint32 createNullDevice : 1; ///< Set to create a null device; see "nullGpuId" below for the
/// ID of the GPU the created device will be based on. Null
/// devices operate in IFH mode; useful for off-line shader
/// compilations.
uint32 enableSvmMode : 1; ///< Enable SVM mode. When this bit is set, PAL will reserve
/// cpu va range with size "maxSvmSize", and allow client
/// to create gpu or pinned memory for use of Svm.
/// For detail of SVM, please refer to CreateSvmGpuMemory
uint32 requestShadowDescriptorVaRange : 1; ///< Requests that PAL provides support for the client to use
/// the @ref VaRange::ShadowDescriptorTable virtual-address
/// range. Some GPU's may not be capable of supporting this,
/// even when requested by the client.
uint32 disableInternalResidencyOpts : 1; ///< Disables residency optimizations for internal GPU memory
/// allocations. Some clients may wish to have them turned
/// off to save on system resources.
uint32 supportRgpTraces : 1; ///< Indicates that the client supports RGP tracing. PAL will
/// use this flag and the hardware support flag to setup the
/// DevDriver RgpServer.
uint32 dontOpenPrimaryNode : 1; ///< No primary node is needed (Linux only)
uint32 disableDevDriver : 1; ///< If no DevDriverMgr should be created with this Platform.
uint32 reserved : 23; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Platform-wide creation flags.
ClientApi clientApiId; ///< Client API ID.
NullGpuId nullGpuId; ///< ID for the null device. Ignored unless the above flags.createNullDevice bit is set.
uint16 apiMajorVer; ///< Major API version number to be used by RGP. Should be set by client based on their
/// contract with RGP.
uint16 apiMinorVer; ///< Minor API version number to be used by RGP. Should be set by client based on their
/// contract with RGP.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 916
uint32 instrApiVer; ///< Instrumentation specification version for API-specific SQTT instrumentation fields.
/// Should be set by client based on the SQTT instrumentation spec version being targeted.
#endif
gpusize maxSvmSize; ///< Maximum amount of virtual address space that will be reserved for SVM
};
/**
************************************************************************************************************************
* @brief Determines the amount of system memory required for a Platform object.
*
* This function must be called before any other interaction with PAL. An allocation of at least this amount of memory
* must be provided in the pPlacementAddr parameter of Pal::CreatePlatform.
*
* @ingroup LibInit
*
* @returns Size, in bytes, of system memory required for an IPlatform object.
************************************************************************************************************************
*/
size_t PAL_STDCALL GetPlatformSize();
/**
***********************************************************************************************************************
* @brief Creates the Platform Abstraction Library.
*
* On execution of CreatePlatform(), PAL will establish a connection for OS and KMD communication, install the specified
* system memory allocation callbacks, and initialize any global internal services. Finally, the client will be
* returned an object pointer to the instantiated platform object, which is used to query the capabilities of the
* system.
*
* @ingroup LibInit
*
* @param [in] createInfo Parameters indicating the client requirements for the platform such as allocation
*                        callbacks or the settings path.
* @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as
* much size available here as reported by calling GetPlatformSize().
* @param [out] ppPlatform Platform object pointer to the instantiated platform. Must not be null.
*
* @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be
* returned:
* + ErrorInvalidPointer will be returned if:
* - ppPlatform is null.
* - pPlacementAddr is null.
* - createInfo.pAllocCb is non-null but pfnAlloc and/or pfnFree is null.
* - createInfo.pSettingsPath is null.
* + ErrorInitializationFailed will be returned if PAL is unable to open a connection to the OS.
***********************************************************************************************************************
*/
Result PAL_STDCALL CreatePlatform(
const PlatformCreateInfo& createInfo,
void* pPlacementAddr,
IPlatform** ppPlatform);
/**
***********************************************************************************************************************
* @brief Provides an association of NULL devices and their associated text name. NULL devices operate in IFH mode
* and are primarily intended for off-line shader compilation mode. The text name is provided for end-user
* identification of the GPU device being created.
*
* @param [in,out] pNullDeviceCount On input, this is the size of the "pNullDevices" array. On output, this
* reflects the number of valid entries in the "pNullDevices" array.
* @param [out] pNullDevices Includes information on the valid NULL devices supported by the system. If
* this is NULL, then pNullDeviceCount reflects the maximum possible size of the
* null-devices array.
*
* @returns Success if the enumeration completed successfully. Otherwise, one of the following error codes may be
* returned:
* + ErrorInvalidPointer will be returned if either input is NULL.
*
* @note NOTE(review): the pNullDevices description treats NULL as a valid way to query the maximum count, while the
*       error list says ErrorInvalidPointer is returned if either input is NULL. Confirm against the implementation
*       which of the two applies to pNullDevices.
***********************************************************************************************************************
*/
Result PAL_STDCALL EnumerateNullDevices(
uint32* pNullDeviceCount,
NullGpuInfo* pNullDevices);
/**
***********************************************************************************************************************
* @brief Provides the NULL device GpuInfo data for the specified NullGpuId.
*
* @param [in] nullGpuId Null GPU ID to look up.
* @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null.
*
* @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned:
* + ErrorInvalidPointer will be returned if pGpuInfo is NULL.
* + NotFound will be returned if the Null GPU ID was not found.
***********************************************************************************************************************
*/
Result PAL_STDCALL GetNullGpuInfoForNullGpuId(
NullGpuId nullGpuId,
GpuInfo* pGpuInfo);
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933
inline Result PAL_STDCALL GetGpuInfoForNullGpuId(
NullGpuId nullGpuId,
GpuInfo* pGpuInfo)
{
return GetNullGpuInfoForNullGpuId(nullGpuId, pGpuInfo);
}
#endif
/**
 ***********************************************************************************************************************
 * @brief Looks up a NULL device by its GPU name string and reports its GpuInfo data.
 *
 * @param [in]  pGpuName  Name string of the GPU to search for (e.g., "NAVI10").
 * @param [out] pGpuInfo  Filled with the matching GpuInfo data when the lookup succeeds.  Must not be null.
 *
 * @returns Success when the lookup succeeds.  Otherwise, one of the following error codes may be returned:
 *          + ErrorInvalidPointer if pGpuName or pGpuInfo is NULL.
 *          + NotFound if the name string was not found.
 ***********************************************************************************************************************
 */
Result PAL_STDCALL GetNullGpuInfoForName(const char* pGpuName, GpuInfo* pGpuInfo);

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933
/// Older spelling of GetNullGpuInfoForName(), only compiled for client interface versions below 933.  Forwards both
/// arguments unchanged and hands back the callee's result.
inline Result PAL_STDCALL GetGpuInfoForName(const char* pGpuName, GpuInfo* pGpuInfo)
{
    const Result lookupResult = GetNullGpuInfoForName(pGpuName, pGpuInfo);
    return lookupResult;
}
#endif
/**
 ***********************************************************************************************************************
 * @brief Looks up a NULL device by hardware revision and reports its GpuInfo data.
 *
 * @param [in]  asicRevision  Hardware revision to search for.
 * @param [out] pGpuInfo      Filled with the matching GpuInfo data when the lookup succeeds.  Must not be null.
 *
 * @returns Success when the lookup succeeds.  Otherwise, one of the following error codes may be returned:
 *          + ErrorInvalidPointer if pGpuInfo is NULL.
 *          + NotFound if the hardware revision was not found.
 ***********************************************************************************************************************
 */
Result PAL_STDCALL GetNullGpuInfoForAsicRevision(AsicRevision asicRevision, GpuInfo* pGpuInfo);

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933
/// Older spelling of GetNullGpuInfoForAsicRevision(), only compiled for client interface versions below 933.  Forwards
/// both arguments unchanged and hands back the callee's result.
inline Result PAL_STDCALL GetGpuInfoForAsicRevision(AsicRevision asicRevision, GpuInfo* pGpuInfo)
{
    const Result lookupResult = GetNullGpuInfoForAsicRevision(asicRevision, pGpuInfo);
    return lookupResult;
}
#endif
/**
***********************************************************************************************************************
* @defgroup LibInit Library Initialization and Destruction
*
* Before initializing PAL, it is important to make sure that the interface version is consistent with the client's
* expectations. The client should check @ref PAL_INTERFACE_MAJOR_VERSION to ensure the major interface version has not
* changed since the last PAL integration. Ideally, this should be performed with a compile-time assert comparing
* @ref PAL_INTERFACE_MAJOR_VERSION against a client-maintained expected major version. Minor interface version
* changes should be backward compatible, and do not require a client change to maintain previous levels of
* functionality.
*
* On startup, the client's first call to PAL must be GetPlatformSize() followed by CreatePlatform(). This function
* gives an opportunity for PAL to perform any necessary platform-wide initialization such as opening a connection for
* communication with the operating system and kernel mode driver or initializing tracking facilities for system memory
* management. CreatePlatform() returns a created IPlatform object for future interaction with PAL.
*
* PAL optionally allows the client to specify a set of memory management callbacks during initialization. If
* specified, PAL will not allocate or free any memory directly from the runtime, instead calling back to the client.
* The client (or application, if the client forwards on the requests) may be able to implement a more efficient
* allocation scheme.
*
* After a successful call to CreatePlatform(), the client should call @ref IPlatform::EnumerateDevices() in order to
* get a list of supported devices attached to the system. This function returns an array of @ref IDevice objects
* which are used by the client to query properties of the devices and eventually execute work on those devices.
* IPlatform::EnumerateDevices() is not available to util-only clients (PAL_BUILD_CORE=0).
*
* The client may re-enumerate devices at any time by calling IPlatform::EnumerateDevices(). The client must make sure
* there is no active work on any device and that all objects associated with those devices have been destroyed.
* IPlatform::EnumerateDevices() will destroy all previously reported @ref IDevice objects and return a fresh set.
* The client is required to re-enumerate devices when it receives an ErrorDeviceLost error from PAL.
*
* After enumerating devices, either during start-up or when recovering from an ErrorDeviceLost error, the client must
* setup and finalize PAL's per-device settings. See IDevice::GetPublicSettings(), IDevice::SetDxRuntimeData(),
* IDevice::CommitSettingsAndInit(), and IDevice::Finalize() for details.
*
* After enumerating devices and finalizing them, the client may query the set of available screens. This is done by
* calling the @ref IPlatform::GetScreens() function. Note that screens are not available for DX clients. Each screen
* is accessible by zero or more of the enumerated devices. Most screens are accessible from a "main" device as well as
* several other devices which can perform cross-display Flip presents to the screen. In some configurations, screens
* may not be directly accessible to any of PAL's devices, in which case fullscreen presents are unavailable to that screen. (This
* typically only occurs in PowerExpress configurations.) Note that when IPlatform::EnumerateDevices() is called, any
* enumerated @ref IScreen objects which existed prior to that call are invalidated for the specified platform and
* IPlatform::GetScreens() needs to be called again to get the updated list of screens.
*
* On shutdown, the client should call @ref IPlatform::Destroy() to allow PAL to cleanup and free any remaining
* platform-wide resources. The client must ensure this call is not made until all other created objects are idle and
* destroyed (if destroyable).
*
* When the client is asked to destroy a device it may call IDevice::Cleanup() to explicitly clean up the device. Some
* clients will find it necessary to call Cleanup(), for example, if their devices have OS handles that become invalid.
* Note that Cleanup() doesn't destroy the device; it will return to its initial state, as if it was newly enumerated.
***********************************************************************************************************************
*/
} // Pal
@@ -0,0 +1,187 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palMsaaState.h
* @brief Defines the Platform Abstraction Library (PAL) IMsaaState interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
namespace Pal
{
/// Specifies the conservative rasterization mode.
enum class ConservativeRasterizationMode : uint8
{
Overestimate = 0x0, ///< Fragments will be generated if the primitive area covers any portion of the pixel.
Underestimate = 0x1, ///< Fragments will be generated if all of the pixel is covered by the primitive.
Count ///< Number of modes listed above.
};
/// Maximum supported number of MSAA color samples.
constexpr uint32 MaxMsaaColorSamples = 16;
/// Maximum supported number of MSAA depth samples.
constexpr uint32 MaxMsaaDepthSamples = 8;
/// Maximum supported number of MSAA fragments.
constexpr uint32 MaxMsaaFragments = 8;
/// Sampling pattern grid size. This is a quad of pixels, i.e. a 2x2 grid of pixels.
constexpr Extent2d MaxGridSize = { 2, 2 };
/// Number of sub-pixel bits: sample positions are rounded to 1/Pow2(SubPixelBits), i.e. 1/16 for the value 4.
constexpr uint32 SubPixelBits = 4;
/// Each pixel is subdivided into a Pow2(SubPixelBits) x Pow2(SubPixelBits) grid of possible sample locations
/// (16 x 16 for SubPixelBits = 4).
constexpr Extent2d SubPixelGridSize = { 16, 16 };
/// A 2D sample coordinate; each component lies in the range [-8/16, 7/16].
struct SampleLocation
{
    int8 x;  ///< Offset along X.
    int8 y;  ///< Offset along Y.

    /// Converts to an Offset2d, sign-extending both components.
    operator Offset2d() const
    {
        Offset2d widened = { x, y };
        return widened;
    }
};
/// Specifies a custom multisample pattern for a pixel quad. Each of the four pixels in the quad gets its own array of
/// MaxMsaaRasterizerSamples sample locations.
struct MsaaQuadSamplePattern
{
SampleLocation topLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for the top-left (TL) pixel of the quad.
SampleLocation topRight[MaxMsaaRasterizerSamples]; ///< Sample locations for the top-right (TR) pixel of the quad.
SampleLocation bottomLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for the bottom-left (BL) pixel of the quad.
SampleLocation bottomRight[MaxMsaaRasterizerSamples]; ///< Sample locations for the bottom-right (BR) pixel of the quad.
};
/// Specifies properties for creation of an @ref IMsaaState object. Input structure to IDevice::CreateMsaaState().
struct MsaaStateCreateInfo
{
uint8 coverageSamples; ///< Number of rasterizer samples. Must be greater than or equal to all sample
/// rates in the pipeline. Valid values are 1, 2, 4, 8, and 16.
uint8 exposedSamples; ///< Number of samples exposed in the pixel shader coverage mask. Must be less
/// than or equal to coverageSamples. Valid values are 1, 2, 4, and 8.
uint8 pixelShaderSamples; ///< Controls the pixel shader execution rate. Must be less than or equal to
/// coverageSamples. Valid values are 1, 2, 4, and 8. Note that a value
/// greater than 1 doesn't mean sample rate shading is enabled. Sample rate
/// shading is enabled by either @ref forceSampleRateShading or the pixel shader.
uint8 depthStencilSamples; ///< Number of samples in the bound depth target. Must be less than or equal to
/// coverageSamples. Valid values are 1, 2, 4, and 8.
uint8 shaderExportMaskSamples; ///< Number of samples to use in the shader export mask. Should match the number
/// of color target fragments clamped to
/// @ref DeviceProperties imageProperties.maxMsaaFragments.
uint8 sampleClusters; ///< Number of sample clusters to control over-rasterization (all samples in a
/// cluster are rasterized if any are hit). Must be less than or equal to
/// coverageSamples. Valid values are 1, 2, 4, and 8.
uint8 alphaToCoverageSamples; ///< How many samples of quality to generate with alpha-to-coverage. Must be
/// less than or equal to coverageSamples. Valid values are 1, 2, 4, 8, and 16.
uint8 occlusionQuerySamples; ///< Controls the number of samples to use for occlusion queries.
/// This value must never exceed the MSAA rate.
uint16 sampleMask; ///< Bitmask of which color target and depth/stencil samples should be updated.
/// The lowest bit corresponds to sample 0.
/// Selects overestimate or underestimate conservative rasterization mode. Used only if
/// @ref MsaaStateCreateInfo::flags::enableConservativeRasterization is set to true.
ConservativeRasterizationMode conservativeRasterizationMode;
/// Boolean creation flags. The five named bits plus the three reserved bits fill the 8-bit u8All view.
union
{
struct
{
uint8 enableConservativeRasterization : 1; ///< Set to true to enable conservative rasterization.
uint8 enable1xMsaaSampleLocations : 1; ///< Set to true to enable 1xMSAA quad sample pattern.
uint8 disableAlphaToCoverageDither : 1; ///< Disables coverage dithering.
uint8 enableLineStipple : 1; ///< Set to true to enable line stippling.
uint8 forceSampleRateShading : 1; ///< Sample rate shading can be enabled by either the pixel
/// shader, or forced here with forceSampleRateShading = 1.
/// Value 0 means sample rate shading is decided by pixel shader
/// and value 1 means sample rate shading is forced enabled.
/// This bit is for openGL glMinSampleShading, where sample rate
/// shading can be enabled by glEnable(GL_SAMPLE_SHADING)
/// instead of by the pixel shader.
uint8 reserved : 3; ///< Reserved for future use.
};
uint8 u8All; ///< All flag bits viewed as a single 8-bit value.
} flags;
};
/**
 ***********************************************************************************************************************
 * @interface IMsaaState
 * @brief     Dynamic state object that controls fixed function MSAA state.
 *
 * It configures the sample counts of various portions of the pipeline, specifies sample positions, etc.  The full range
 * of EQAA hardware features is exposed.
 *
 * @see IDevice::CreateMsaaState
 ***********************************************************************************************************************
 */
class IMsaaState : public IDestroyable
{
public:
    /// Stores an arbitrary client data pointer on this object, which lets the client associate its own data with a
    /// particular PAL object.
    ///
    /// @param [in] pClientData  A pointer to arbitrary client data.
    void SetClientData(void* pClientData) { m_pClientData = pClientData; }

    /// Retrieves the arbitrary client data pointer associated with this object.
    ///
    /// @returns The pointer last stored through SetClientData(), or null if none has been stored yet.
    void* GetClientData() const { return m_pClientData; }

protected:
    /// @internal Constructor.  Kept protected to prevent use of the new operator on this interface; the client must
    /// create objects by explicitly calling the proper create method.
    IMsaaState() { }

    /// @internal Destructor.  Kept protected to prevent use of the delete operator on this interface; the client must
    /// destroy objects by explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory
    /// allocated for the object on their own.
    virtual ~IMsaaState() { }

private:
    /// @internal Client data pointer.  It can hold an arbitrary value, which is read back with GetClientData() and
    /// written with SetClientData().  For non-top-layer objects, this will point to the layer above the current
    /// object.
    void* m_pClientData = nullptr;
};
} // Pal
@@ -0,0 +1,619 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palPerfExperiment.h
* @brief Defines the Platform Abstraction Library (PAL) IPerfExperiment interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palGpuMemoryBindable.h"
namespace Pal
{
/// Specifies a particular block on the GPU to gather counters for.
///
/// Every enumerator except Count has an explicitly assigned value. Ge1 is an alias of Ge, and the value 0x38 is spelled
/// RlcUser for client interface versions below 926 and RlcLocal otherwise. Count directly follows the 0x38 entry and
/// therefore evaluates to 0x39.
enum class GpuBlock : uint32
{
Cpf = 0x0,
Ia = 0x1,
Vgt = 0x2,
Pa = 0x3,
Sc = 0x4,
Spi = 0x5,
Sq = 0x6,
Sx = 0x7,
Ta = 0x8,
Td = 0x9,
Tcp = 0xA,
Tcc = 0xB,
Tca = 0xC,
Db = 0xD,
Cb = 0xE,
Gds = 0xF,
Srbm = 0x10,
Grbm = 0x11,
GrbmSe = 0x12,
Rlc = 0x13,
Dma = 0x14,
Mc = 0x15,
Cpg = 0x16,
Cpc = 0x17,
Wd = 0x18,
Tcs = 0x19,
Atc = 0x1A,
AtcL2 = 0x1B,
McVmL2 = 0x1C,
Ea = 0x1D,
Rpb = 0x1E,
Rmi = 0x1F,
Umcch = 0x20,
Ge = 0x21,
Gl1a = 0x22,
Gl1c = 0x23,
Gl1cg = 0x24,
Gl2a = 0x25, // TCA is used in Gfx9, and changed to GL2A in Gfx10
Gl2c = 0x26, // TCC is used in Gfx9, and changed to GL2C in Gfx10
Cha = 0x27,
Chc = 0x28,
Chcg = 0x29,
Gus = 0x2A,
Gcr = 0x2B,
Ph = 0x2C,
UtcL1 = 0x2D,
Ge1 = Ge, // Alias: shares the value of Ge (0x21).
GeDist = 0x2E,
GeSe = 0x2F,
DfMall = 0x30, // The DF subblocks have unique instances and event IDs but they all share the DF's perf counters.
SqWgp = 0x31, // SQ counters that can be sampled at WGP granularity.
Pc = 0x32,
Gl1xa = 0x33,
Gl1xc = 0x34,
Wgs = 0x35,
EaCpwd = 0x36,
EaSe = 0x37,
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 926
RlcUser = 0x38,
#else
RlcLocal = 0x38,
#endif
Count // One past the last block value.
};
/// Distinguishes between global and streaming performance monitor (SPM) counters.
enum class PerfCounterType : uint32
{
Global = 0x0, ///< Represents the traditional summary perf counters.
Spm = 0x1, ///< Represents streaming performance counters.
Spm32 = 0x2, ///< Represents 32-bit streaming performance counters.
Count ///< Number of counter types listed above.
};
/// Reports the type of data the hardware writes for a particular counter.
enum class PerfCounterDataType : uint32
{
Uint32 = 0x0, ///< Presumably a 32-bit unsigned integer, going by the name.
Uint64 = 0x1, ///< Presumably a 64-bit unsigned integer, going by the name.
Count ///< Number of data types listed above.
};
/// Distinguishes between normal thread traces and streaming performance monitor (SPM) traces.
enum class PerfTraceType : uint32
{
ThreadTrace = 0x0, ///< A normal thread trace.
SpmTrace = 0x1, ///< A streaming performance monitor (SPM) trace.
Count ///< Number of trace types listed above.
};
/// Mask values ORed together to choose which shader stages a performance experiment should sample. Each stage
/// occupies its own bit, and PerfShaderMaskAll is the union of all seven stage bits.
enum PerfExperimentShaderFlags
{
PerfShaderMaskPs = 0x01,
PerfShaderMaskVs = 0x02,
PerfShaderMaskGs = 0x04,
PerfShaderMaskEs = 0x08,
PerfShaderMaskHs = 0x10,
PerfShaderMaskLs = 0x20,
PerfShaderMaskCs = 0x40,
PerfShaderMaskAll = 0x7f, ///< All of the stage bits above ORed together.
};
/// Selects one of the generic performance trace markers, which the client can use to track data of its own choosing.
/// For client interface versions below 874 the names A and B are also available as aliases of SqttA and SqttB.
enum class PerfTraceMarkerType : uint32
{
SqttA = 0x0,
SqttB = 0x1,
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 874
A = SqttA, ///< Alias of SqttA.
B = SqttB, ///< Alias of SqttB.
#endif
SpmA = 0x2,
SpmB = 0x3,
SpmC = 0x4,
SpmD = 0x5,
Count ///< Number of distinct marker types (aliases do not add to it).
};
/// Specifies the features a device offers for supporting performance measurements. The six feature bits plus the 26
/// reserved bits fill the 32-bit u32All view.
union PerfExperimentDeviceFeatureFlags
{
struct
{
uint32 counters : 1; ///< Device supports performance counters.
uint32 threadTrace : 1; ///< Device supports thread traces.
uint32 spmTrace : 1; ///< Device supports streaming perf monitor traces.
uint32 dfSpmTrace : 1; ///< Device supports streaming df perf monitor traces.
uint32 supportPs1Events : 1; ///< The thread trace HW of this Device is capable of producing event tokens
/// from the second PS backend of SC.
uint32 sqttBadScPackerId : 1; ///< Hardware is affected by a bug causing the packer ID specified in new PS waves
/// to be incorrect in SQ thread trace data.
uint32 reserved : 26; ///< Reserved for future use.
};
uint32 u32All; ///< Feature flags packed as 32-bit uint.
};
/// Specifies properties for a perf counter being added to a perf experiment. Input structure to
/// IPerfExperiment::AddCounter().
///
/// A note for GpuBlock::SqWgp
/// Clients of palPerfExperiment may configure counters of GpuBlock::SqWgp at per-WGP granularity
/// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock gating (CGCG)
/// and power gating. PAL exposes this feature to clients.
/// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same
/// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity.
/// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features
/// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE.
/// The counter data is still reported per WGP (not aggregated for the whole SE).
///
struct PerfCounterInfo
{
PerfCounterType counterType; ///< Type of counter to add.
GpuBlock block; ///< Which block to reference.
uint32 instance; ///< Instance of that block in the device.
uint32 eventId; ///< Which event ID to track.
// Some blocks have additional per-counter controls. They must be properly programmed when adding counters for
// the relevant blocks. It's recommended to zero them out when not in use.
union
{
struct
{
uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield.
} df;
struct
{
uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold.
uint8 eventThresholdEn; ///< Threshold enable (0 for disabled, 1 for <threshold, 2 for >threshold).
uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write).
} umc;
uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events.
uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds
/// 32 bits.
} subConfig;
};
/// Specifies properties for setting up a streaming performance counter trace. Input structure to
/// IPerfExperiment::AddSpmTrace().
struct SpmTraceCreateInfo
{
uint32 spmInterval; ///< Interval between each sample in terms of GPU sclks. Minimum of 32.
gpusize ringSize; ///< Suggested size of the SPM output ring buffer in bytes. PAL may use
/// a smaller ring in practice but it cannot exceed this size.
uint32 numPerfCounters; ///< Number of performance counters to be collected in this trace.
const PerfCounterInfo* pPerfCounterInfos; ///< Array of numPerfCounters PerfCounterInfo entries.
};
/// Reports the layout of a single global perf counter sample.
struct GlobalSampleLayout
{
GpuBlock block; ///< Type of GPU block.
uint32 instance; ///< Which instance of that type of GPU block.
uint32 slot; ///< Slot varies in meaning per block.
uint32 eventId; ///< Sampled event ID.
PerfCounterDataType dataType; ///< What type of data is written (e.g., 32-bit uint).
gpusize beginValueOffset; ///< Offset in bytes where the sample data begins.
gpusize endValueOffset; ///< Offset in bytes where the sample data ends.
};
/// Describes the layout of global perf counter data in memory. The samples[] member is declared with one element
/// but stands for sampleCount entries.
struct GlobalCounterLayout
{
uint32 sampleCount; ///< Number of samples described in samples[].
GlobalSampleLayout samples[1]; ///< Describes the layout of each sample. This structure is repeated (sampleCount
/// - 1) additional times.
};
/// Enumeration of SQ Thread trace token types. All versions of Thread Trace (TT) are represented. If an unsupported
/// token is enabled, no error is reported.
///
/// Each named token occupies its own bit (0x00000001 through 0x00800000), so the values can be ORed together into a
/// mask. This is an unscoped enum, so the enumerator names are visible directly in the enclosing namespace.
enum ThreadTraceTokenTypeFlags : Pal::uint32
{
Misc = 0x00000001, ///< A miscellaneous event has been sent. TT 2.3
Timestamp = 0x00000002, ///< Timestamp tokens. TT 2.3
Reg = 0x00000004, ///< Register activity token. TT 2.3
WaveStart = 0x00000008, ///< A wavefront has started. TT 2.3
WaveAlloc = 0x00000010, ///< Output space has been allocated for vertex position or color/Z. TT 2.3.
RegCsPriv = 0x00000020, ///< There has been a compute pipeline private data, state or threadgroup update. TT 2.3.
WaveEnd = 0x00000040, ///< Wavefront completion. TT 2.3
Event = 0x00000080, ///< An event has reached the top of a shader stage. TT 2.3
EventCs = 0x00000100, ///< An event has reached the top of a compute shader stage. TT 2.3
EventGfx1 = 0x00000200, ///< An event has reached the top of a shader stage for the second GFX pipe. TT 2.3
Inst = 0x00000400, ///< The shader has executed an instruction. TT 2.3
InstPc = 0x00000800, ///< The shader has explicitly written the PC value. TT 2.3
InstUserData = 0x00001000, ///< The shader has written user data into the thread trace buffer. TT 2.3
Issue = 0x00002000, ///< Provides information about instruction scheduling. TT 2.3
Perf = 0x00004000, ///< The performance counter delta has been updated. TT 2.3 and below only.
RegCs = 0x00008000, ///< A compute state update packet has been received by the SPI. TT 2.3
VmemExec = 0x00010000, ///< A previously issued VMEM instruction is now being sent to LDS/TA. TT 3.0
AluExec = 0x00020000, ///< A previously issued VALU instruction is now being executed. TT 3.0
ValuInst = 0x00040000, ///< A VALU instruction has been issued. TT 3.0.
WaveRdy = 0x00080000, ///< Mask of which waves became ready this cycle but did not issue an instruction. TT 3.0
Immed1 = 0x00100000, ///< One wave issued an immediate instruction this cycle. TT 3.0.
Immediate = 0x00200000, ///< One or more waves have issued an immediate instruction this cycle. TT 3.0.
UtilCounter = 0x00400000, ///< A new set of utilization counter values. TT 3.0.
RealTime = 0x00800000, ///< Output realtime. TT 3.3.
All = 0xFFFFFFFF ///< Enable all the above tokens (every bit of the mask is set).
};
/// Enumeration of register types whose reads/writes can be traced. Register reads are disabled by default as they can
/// generate a lot of traffic and cause the GPU to hang.
///
/// The individual register types each occupy their own bit so they can be ORed together into a mask. This is an
/// unscoped enum, so the enumerator names are visible directly in the enclosing namespace.
enum ThreadTraceRegTypeFlags : Pal::uint32
{
EventRegs = 0x00000001, ///< Event registers. TT 2.3.
DrawRegs = 0x00000002, ///< Draw registers. TT 2.3.
DispatchRegs = 0x00000004, ///< Dispatch registers. TT 2.3.
UserdataRegs = 0x00000008, ///< UserData Registers. Must be explicitly requested in TT 2.3.
MarkerRegs = 0x00000010, ///< Thread trace marker data regs. TT 2.3.
ShaderConfigRegs = 0x00000020, ///< Shader configuration state. TT 3.0.
ShaderLaunchStateRegs = 0x00000040, ///< Shader program launch state. TT 3.0.
GraphicsPipeStateRegs = 0x00000080, ///< Graphics pipeline state. TT 3.0.
AsyncComputeRegs = 0x00000100, ///< Async compute registers. TT 3.0.
GraphicsContextRegs = 0x00000200, ///< Graphics context registers. TT 3.0.
OtherConfigRegs = 0x00000400, ///< Other regs. TT 2.3.
AllRegWrites = 0x000007FF, ///< All reg writes other than OtherBusRegs (the eleven bits above ORed together).
OtherBusRegs = 0x00000800, ///< All write activity over gfx and compute buses. Debug only. TT 3.0.
AllRegReads = 0x00001000, ///< Not encouraged to be enabled. This can cause a GPU hang.
AllReadsAndWrites = 0xFFFFFFFF ///< All reads and writes. Not encouraged. This can cause a GPU hang.
};
/// Represents thread trace token types and register types that can be enabled to be reported in the trace data. If
/// a particular token type or reg type is unsupported, no error is returned and the thread trace is configured with
/// the minimum supported tokens in the user provided config.
struct ThreadTraceTokenConfig
{
/// Mask of ThreadTraceTokenTypeFlags values ORed together.
uint32 tokenMask;
/// Mask of ThreadTraceRegTypeFlags values ORed together.
uint32 regMask;
};
/// Specifies properties for a perf trace being added to a perf experiment. Input structure to
/// IPerfExperiment::AddThreadTrace().
struct ThreadTraceInfo
{
PerfTraceType traceType; ///< Type of trace to add.
uint32 instance; ///< Selected trace instance.
/// One bit per option; each bit is named after a member of optionValues.
/// NOTE(review): presumably a set bit tells PAL to honor the same-named optionValues member rather than a default -
/// confirm against IPerfExperiment::AddThreadTrace().
union
{
struct
{
// Options common to all traces
uint32 bufferSize : 1;
// Thread trace only options
uint32 threadTraceTargetSh : 1;
uint32 threadTraceTargetCu : 1;
uint32 threadTraceSh0CounterMask : 1;
uint32 threadTraceSh1CounterMask : 1;
uint32 threadTraceSimdMask : 1;
uint32 threadTraceVmIdMask : 1;
uint32 threadTraceRandomSeed : 1;
uint32 threadTraceShaderTypeMask : 1;
uint32 threadTraceIssueMask : 1;
uint32 threadTraceWrapBuffer : 1;
uint32 threadTraceStallBehavior : 1;
uint32 threadTraceTokenConfig : 1;
uint32 threadTraceStallAllSimds : 1;
uint32 threadTraceExcludeNonDetailShaderData : 1;
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899
uint32 threadTraceEnableExecPop : 1;
#else
// Keeps this bit position occupied for client interface versions below 899 so that the bit layout is unchanged.
uint32 placeholder2 : 1;
#endif
uint32 reserved : 16; // Remaining bits; the 16 bits above plus these 16 fill u32All.
};
uint32 u32All; ///< All option flag bits viewed as a single 32-bit value.
} optionFlags;
/// Values for the options named in optionFlags.
struct
{
// Options common to all traces
size_t bufferSize;
// Thread trace only options
ThreadTraceTokenConfig threadTraceTokenConfig;
uint32 threadTraceTargetSh;
uint32 threadTraceTargetCu;
uint32 threadTraceSh0CounterMask;
uint32 threadTraceSh1CounterMask;
uint32 threadTraceSimdMask;
uint32 threadTraceVmIdMask;
uint32 threadTraceRandomSeed;
PerfExperimentShaderFlags threadTraceShaderTypeMask;
uint32 threadTraceIssueMask;
bool threadTraceWrapBuffer;
uint32 threadTraceStallBehavior;
bool threadTraceStallAllSimds;
bool threadTraceExcludeNonDetailShaderData;
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899
// Only present for client interface versions 899 and newer, so the struct size differs between versions.
bool threadTraceEnableExecPop;
#endif
} optionValues;
};
/// Reports thread trace data written when the trace is stopped (copied from internal SQ registers). Consists of three
/// 32-bit register values.
struct ThreadTraceInfoData
{
uint32 curOffset; ///< Contents of SQ_THREAD_TRACE_WPTR register.
uint32 traceStatus; ///< Contents of SQ_THREAD_TRACE_STATUS register.
uint32 writeCounter; ///< Contents of SQ_THREAD_TRACE_CNTR register.
};
/// Describes the layout of a single shader engine's thread trace data.
struct ThreadTraceSeLayout
{
uint32 shaderEngine; ///< Shader engine index.
uint32 computeUnit; ///< Compute unit index.
gpusize infoOffset; ///< Offset to ThreadTraceInfoData in memory.
gpusize infoSize; ///< Size in bytes reserved for ThreadTraceInfoData.
gpusize dataOffset; ///< Offset in bytes to the actual trace data.
gpusize dataSize; ///< Amount of trace data, in bytes.
};
/// Describes how the thread trace data is laid out. The traces[] member is declared with one element but stands for
/// traceCount entries.
struct ThreadTraceLayout
{
uint32 traceCount; ///< Number of entries in traces[].
ThreadTraceSeLayout traces[1]; ///< ThreadTraceSeLayout repeated (traceCount - 1) additional times.
};
/// Describes a single SPM counter instance.
struct SpmCounterData
{
GpuBlock gpuBlock; ///< The kind of GPU block this counter measured.
uint32 instance; ///< Which specific global block instance this counter measured.
uint32 eventId; ///< The event that was measured by this counter.
uint32 offsetLo; ///< Byte offset within each sample to the lower 16-bit half of the counter data.
uint32 offsetHi; ///< Byte offset within each sample to the upper 16-bit half of the counter data.
bool is32Bit; ///< True if the client must combine the independent 16-bit halves into a single 32-bit value.
/// If this is false, offsetLo points to the full 16-bit data value and offsetHi is ignored.
};
/// All information required to parse the counter data out of a SpmTrace results buffer.
///
/// Note that the hardware will continue to write samples to the SPM ring buffer even if it runs out of unused space.
/// The hardware will simply wrap the ring's write pointer back around to the first sample's location. Each subsequent
/// sample will overwrite the oldest sample in the ring. When the trace is finished we will have at most @ref
/// maxNumSamples valid samples.
///
/// PAL doesn't zero out the ring memory so it's generally hard for the client to distinguish valid samples from random
/// data present in unused sample locations. PAL does guarantee that the final sample location in the ring has its
/// timestamp zeroed out before the SPM trace starts. This means this last timestamp will only be non-zero if the ring
/// has completely filled up and the WrPtr has wrapped one or more times. The client must inspect this timestamp when
/// parsing the sample data:
/// 1. The last timestamp is zero. The ring did not wrap. The oldest sample is at @ref sampleOffset. The ring's write
///    pointer tells us how many samples were written. From the write pointer onwards the ring contains invalid data.
/// 2. The last timestamp is non-zero. The ring did wrap. The ring's write pointer points to the oldest sample,
///    effectively a random sample offset into the ring. The full ring contains valid sample data but it's not in
///    oldest-to-newest order, it's shifted. The client can walk the ring from the write pointer's location (wrapping
///    as they go) to parse all @ref maxNumSamples samples out in oldest-to-newest order.
struct SpmTraceLayout
{
    gpusize offset; ///< Byte offset into the bound GPU memory where the spm trace data begins.
                    ///  The @ref wrPtrOffset and @ref sampleOffset are relative to this value.
    uint32 wrPtrOffset; ///< Byte offset within SPM trace data to the HW's write pointer (WrPtr) DWORD.
                        ///  The WrPtr's value is an offset relative to @ref sampleOffset. Don't assume this is
                        ///  a byte offset (see @ref wrPtrGranularity). The WrPtr's value shows where the HW's
                        ///  theoretical next sample would go. This value may wrap back to zero if the HW runs out
                        ///  of space in the SPM ring buffer.
    uint32 wrPtrGranularity; ///< The WrPtr's granularity. Multiply WrPtr's value by this value to get a byte offset.
    uint32 sampleOffset; ///< Byte offset within the SPM trace data to the array of samples. The HW will write the
                         ///  first sample here but it will be overwritten if the ring wraps (see the top comment).
    uint32 sampleStride; ///< The distance between consecutive samples in bytes. May include empty padding.
    uint32 maxNumSamples; ///< The maximum number of samples the HW can write before wrapping. The SPM ring buffer
                          ///  ends at sampleOffset + sampleStride * maxNumSamples.
    uint32 numCounters; ///< The true length of counterData. The client must allocate extra memory for the array.
    SpmCounterData counterData[1]; ///< The layout and identity of the counters in the samples. Declared with one
                                   ///  element; numCounters entries are expected to be present.
};
/// Represents the information that is stored in the DF SPM trace metadata buffer.
struct DfSpmTraceMetadataLayout
{
    uint32 numRecordPairs; ///< The number of 64-byte blocks written by this trace. There are two time segments
                           ///  per 64-byte block so we have to check the lastSpmPkt bit to see which half of
                           ///  the last 64-byte block is the last packet.
    uint32 padding; ///< Explicit padding to match what the compiler does by default.
                    ///  NOTE(review): presumably keeps the 64-bit timestamps below 8-byte aligned - confirm.
    uint64 beginTimestamp; ///< The DF timestamp at the start of the DF SPM trace.
    uint64 endTimestamp; ///< The DF timestamp at the finish of the DF SPM trace.
};
/// Specifies properties for creation of an @ref IPerfExperiment object. Input structure to
/// IDevice::CreatePerfExperiment().
struct PerfExperimentCreateInfo
{
    /// One bit per member of @ref optionValues (the names match one-to-one).
    /// NOTE(review): presumably a set bit means the matching optionValues member was specified by the client and
    /// should be honored instead of PAL's default - confirm against the implementation.
    union
    {
        struct
        {
            uint32 cacheFlushOnCounterCollection : 1; ///< Bit for optionValues.cacheFlushOnCounterCollection.
            uint32 sampleInternalOperations : 1; ///< Bit for optionValues.sampleInternalOperations.
            uint32 sqShaderMask : 1; ///< Bit for optionValues.sqShaderMask.
            uint32 sqWgpShaderMask : 1; ///< Bit for optionValues.sqWgpShaderMask.
            uint32 reserved : 28; ///< Remaining bits of the 32-bit flag word.
        };
        uint32 u32All; ///< All option flags viewed as a single 32-bit value.
    } optionFlags;
    /// The option values themselves; each member shares its name with a bit in @ref optionFlags.
    struct
    {
        bool cacheFlushOnCounterCollection; ///< NOTE(review): name suggests a cache flush is performed when
                                            ///  counters are collected - confirm.
        bool sampleInternalOperations; ///< NOTE(review): name suggests PAL-internal operations are included in
                                       ///  the sampled work - confirm.
        PerfExperimentShaderFlags sqShaderMask; ///< GpuBlock::Sq counters only look at these shader types.
        PerfExperimentShaderFlags sqWgpShaderMask; ///< GpuBlock::SqWgp counters only look at these shader types.
    } optionValues;
};
/**
 ***********************************************************************************************************************
 * @interface IPerfExperiment
 * @brief     Set of performance profiling activities to be performed over a specific range of commands in a command
 *            buffer.
 *
 * @warning The details of building a performance experiment are not very well documented here. Please see your local
 *          hardware performance expert for more details until this documentation can be fully fleshed out.
 *
 * @see IDevice::CreatePerfExperiment
 ***********************************************************************************************************************
 */
class IPerfExperiment : public IGpuMemoryBindable
{
public:
    /// Adds the specified performance counter to be tracked as part of this perf experiment.
    ///
    /// @param [in] counterInfo Specifies which counter to add: which hardware block, instance, any options, etc.
    ///
    /// @returns Success if the counter was successfully added to the experiment, otherwise an appropriate error code.
    virtual Result AddCounter(
        const PerfCounterInfo& counterInfo) = 0;
    /// Queries the layout of counter results in memory for this perf experiment.
    ///
    /// @param [out] pLayout Layout describing the begin and end offset of each counter in the resulting GPU memory once
    ///                      this perf experiment is executed. Should correspond with counters added via AddCounter().
    ///
    /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code.
    virtual Result GetGlobalCounterLayout(
        GlobalCounterLayout* pLayout) const = 0;
    /// Adds the specified thread trace to be recorded as part of this perf experiment.
    ///
    /// @param [in] traceInfo Specifies what type of trace to record, which block instance to trace, and options, etc.
    ///
    /// @returns Success if the trace was successfully added to the experiment, otherwise an appropriate error code.
    virtual Result AddThreadTrace(
        const ThreadTraceInfo& traceInfo) = 0;
    /// Adds the specified DfSpmTrace to be recorded as part of this perf experiment.
    ///
    /// @param [in] dfSpmCreateInfo Specifies the parameters of the df spm trace and
    ///                             provides the list of perf counters.
    ///
    /// @returns Success if the df spm trace was successfully added to the experiment,
    ///          otherwise an appropriate error code.
    virtual Result AddDfSpmTrace(
        const SpmTraceCreateInfo& dfSpmCreateInfo) = 0;
    /// Adds the specified SpmTrace to be recorded as part of this perf experiment.
    ///
    /// @param [in] spmCreateInfo Specifies the parameters of the spm trace and provides the list of perf counters.
    ///
    /// @returns Success if the spm trace was successfully added to the experiment, otherwise an appropriate error code.
    virtual Result AddSpmTrace(
        const SpmTraceCreateInfo& spmCreateInfo) = 0;
    /// Queries the layout of thread trace results in memory for this perf experiment.
    ///
    /// @param [out] pLayout Layout describing how the results of each thread trace will be written to GPU memory when
    ///                      this perf experiment is executed. Should correspond with thread traces added via
    ///                      AddThreadTrace().
    ///
    /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code.
    virtual Result GetThreadTraceLayout(
        ThreadTraceLayout* pLayout) const = 0;
    /// Queries the layout of streaming counter trace results in memory for this perf experiment.
    ///
    /// The caller is expected to call this function twice. The first time with pLayout->numCounters = 0 which prompts
    /// PAL to only set numCounters to the correct number of SPM counters and return. The second call with a non-zero
    /// numCounters prompts PAL to fill out the full structure and counterData array.
    ///
    /// Note that @ref SpmTraceLayout contains a variable length array. The caller must allocate enough memory for
    /// an additional "numCounters - 1" copies of @ref SpmCounterData.
    ///
    /// @param [out] pLayout Layout describing the layout of the streaming counter trace results in the resulting
    ///                      GPU memory once this perf experiment is executed.
    ///
    /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code.
    virtual Result GetSpmTraceLayout(
        SpmTraceLayout* pLayout) const = 0;
    /// Finalizes the performance experiment preparing it for execution.
    ///
    /// @returns Success if the operation executed successfully, otherwise an appropriate error code.
    virtual Result Finalize() = 0;
    /// Returns the value of the associated arbitrary client data pointer.
    /// Can be used to associate arbitrary data with a particular PAL object.
    ///
    /// @returns Pointer to client data.
    void* GetClientData() const
    {
        return m_pClientData;
    }
    /// Sets the value of the associated arbitrary client data pointer.
    /// Can be used to associate arbitrary data with a particular PAL object.
    ///
    /// @param [in] pClientData A pointer to arbitrary client data.
    void SetClientData(
        void* pClientData)
    {
        m_pClientData = pClientData;
    }
protected:
    /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
    /// calling the proper create method. The client data pointer starts out as nullptr.
    IPerfExperiment() : m_pClientData(nullptr) {}
    /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
    /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
    /// object on their own.
    virtual ~IPerfExperiment() { }
private:
    /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
    /// and set via SetClientData().
    /// For non-top-layer objects, this will point to the layer above the current object.
    void* m_pClientData;
};
} // Pal
@@ -0,0 +1,896 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palPipeline.h
* @brief Defines the Platform Abstraction Library (PAL) IPipeline interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palGpuMemoryBindable.h"
#include "palDestroyable.h"
#include "palImage.h"
#include "palShaderLibrary.h"
#include "palSpan.h"
#include <utility>
// Forward declarations of types that live in the Util namespace (Util::Abi and Util::HsaAbi).
namespace Util
{
namespace Abi
{
union ApiHwShaderMapping;
enum class HardwareStage : uint32; // Opaque declaration: only the underlying type is stated here.
}
namespace HsaAbi
{
struct KernelArgument;
}
}
namespace Pal
{
// Forward declarations of Pal types whose definitions are not provided at this point in the header.
struct GpuMemSubAllocInfo;
enum class PrimitiveTopology : uint8; // Opaque declaration: only the underlying type is stated here.
/// Specifies a shader type (i.e., what stage of the pipeline this shader was written for).
///
/// Enumerators are numbered consecutively starting at zero, so Count equals the number of shader types.
enum class ShaderType : uint32
{
    Compute = 0,
    Task,
    Vertex,
    Hull,
    Domain,
    Geometry,
    Mesh,
    Pixel,
    Count ///< Number of shader types; not itself a shader type.
};
/// Number of shader program types supported by PAL (the numeric value of ShaderType::Count).
constexpr uint32 NumShaderTypes = static_cast<uint32>(ShaderType::Count);
/// Maximum number of viewports.
constexpr uint32 MaxViewports = 16;
/// Maximum number of supported stream-output declaration entries by any PAL device.
constexpr uint32 MaxStreamOutEntries = 512;
/// Specifies a general primitive category without differentiating between a strip or list and without specifying
/// whether the primitive will include adjacency info or not.
enum class PrimitiveType : uint32
{
    Point = 0x0,
    Line = 0x1,
    Triangle = 0x2,
    Rect = 0x3,
    Quad = 0x4,
    Patch = 0x5,
    Count ///< Number of primitive types; not itself a primitive type.
};
/// Specifies the target range of Z values after viewport transform.
enum class DepthRange : uint32
{
    ZeroToOne = 0x0, ///< Z values range from 0 to 1.
    NegativeOneToOne = 0x1, ///< Z values range from -1 to 1.
};
/// Specifies whether the v/t texture coordinates of a point sprite map 0 to 1 from top to bottom or bottom to top.
enum class PointOrigin : uint32
{
    UpperLeft = 0x0, ///< NOTE(review): presumably the "top to bottom" mapping (origin in the upper left) - confirm.
    LowerLeft = 0x1, ///< NOTE(review): presumably the "bottom to top" mapping (origin in the lower left) - confirm.
    Count ///< Number of point origins; not itself a valid origin.
};
/// Specifies a primitive's shade mode.
enum class ShadeMode : uint32
{
    Gouraud = 0x0, ///< Gouraud shading mode, pixel shader input is interpolation of vertex
    Flat = 0x1, ///< Flat shading mode, pixel shader input from provoking vertex
    Count ///< Number of shade modes; not itself a valid shade mode.
};
/// Defines a logical operation applied between the color coming from the pixel shader and the current value in the
/// target image.
///
/// The sixteen enumerators occupy the values 0x0 through 0xF.
/// NOTE(review): the exact source/destination operand roles of the "Reverse" and "Inverted" variants are not stated
/// in this header - confirm against the implementation before relying on them.
enum class LogicOp : uint32
{
    Copy = 0x0,
    Clear = 0x1,
    And = 0x2,
    AndReverse = 0x3,
    AndInverted = 0x4,
    Noop = 0x5,
    Xor = 0x6,
    Or = 0x7,
    Nor = 0x8,
    Equiv = 0x9,
    Invert = 0xA,
    OrReverse = 0xB,
    CopyInverted = 0xC,
    OrInverted = 0xD,
    Nand = 0xE,
    Set = 0xF,
};
/// Shader Engine Dispatch Interleave Size
///
/// This determines how many Threads or Threadgroups are sent to one SE before switching to the next SE.
/// Work is always distributed in Threadgroups though.
///
/// The 1D values are specified in Threads and the Threadgroups are walked in a 1D typewriter fashion.
/// The 2D values are specified in Threadgroups and also walked in typewriter fashion (in groups of the 2D pattern).
///
/// Clients should check for 1D and 2D support separately in:
/// - DeviceProperties::gfxipProperties::flags::support1dDispatchInterleave
/// - DeviceProperties::gfxipProperties::flags::support2dDispatchInterleave
///
/// Default will result in "Disable" for chips which do not support 1D or 2D.
/// Disable means that every Threadgroup is issued to the next SE.
enum class DispatchInterleaveSize : uint32
{
    Default, ///< Results in "Disable" for chips which do not support 1D or 2D (see above).
    Disable, ///< Every Threadgroup is issued to the next SE.
    _1D_64_Threads,
    _1D_128_Threads,
    _1D_256_Threads,
    _1D_512_Threads,
    _2D_1x1_ThreadGroups,
    _2D_1x2_ThreadGroups,
    _2D_1x4_ThreadGroups,
    _2D_1x8_ThreadGroups,
    _2D_1x16_ThreadGroups,
    _2D_2x1_ThreadGroups,
    _2D_2x2_ThreadGroups,
    _2D_2x4_ThreadGroups,
    _2D_2x8_ThreadGroups,
    _2D_4x1_ThreadGroups,
    _2D_4x2_ThreadGroups,
    _2D_4x4_ThreadGroups,
    _2D_8x1_ThreadGroups,
    _2D_8x2_ThreadGroups,
    _2D_16x1_ThreadGroups,
    Count, ///< Number of interleave sizes; not itself a valid interleave size.
};
/// Specifies whether to override the binning setting for a pipeline. An enum value of Default follows the PBB global
/// setting. An Enable or Disable value overrides the PBB global setting for the pipeline and sets binning accordingly.
enum class BinningOverride : uint32
{
    Default = 0x0, ///< Follow the PBB global setting.
    Disable = 0x1, ///< Override the PBB global setting: binning is disabled for the pipeline.
    Enable = 0x2, ///< Override the PBB global setting: binning is enabled for the pipeline.
    Count ///< Number of override values; not itself a valid override.
};
/// Per-pipeline override of the LDS PS group size. GPU behavior is controlled by LDS_GROUP_SIZE.
/// NOTE(review): the enumerator names suggest the group size is expressed in waves (one or two) - confirm.
enum class LdsPsGroupSizeOverride : uint32
{
    Default = 0x0,
    SingleWave = 0x1,
    DoubleWaves = 0x2
};
/// Tri-state enum which controls enabling or disabling a feature or behavior, or letting PAL select a sensible
/// default. Note that the underlying type is signed because Default is -1.
enum class OverrideMode : int32
{
    Default = -1, ///< PAL selects the default behavior, which could be either enabled or disabled.
    Disabled = 0, ///< Force to disabled. Equal to set to False.
    Enabled = 1, ///< Force to enabled. Equal to set to True.
};
/// Enumerates the depth clamping modes a pipeline can use.
enum class DepthClampMode : uint32
{
    Viewport = 0x0, ///< Clamps to the viewport min/max depth bounds
    _None = 0x1, ///< Disables depth clamping
#if PAL_BUILD_SUPPORT_DEPTHCLAMPMODE_ZERO_TO_ONE
    ZeroToOne = 0x2, ///< Clamps between 0.0 and 1.0. Only declared when the build enables this mode.
#endif
    // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either
    // undefing None before including this header or using _None when dealing with PAL. When the macro is defined
    // the alias below is not declared at all.
#ifndef None
    None = _None, ///< Disables depth clamping (alias of _None).
#endif
};
/// Common flags controlling creation of both compute and graphics pipelines.
union PipelineCreateFlags
{
    struct
    {
        uint32 clientInternal : 1; ///< Internal pipeline not created by the application.
        uint32 reverseWorkgroupOrder : 1; ///< Indicates that any Dispatch using this pipeline should execute in
                                          ///  reverse workgroup order. This supersedes the flag on the CommandBuffer
                                          ///  (dispatchPingPongWalk) - always forcing reverse workgroup order! This
                                          ///  is a best effort as not all implementations or Queues may support this.
        uint32 reserved : 30; ///< Reserved for future use.
    };
    uint32 u32All; ///< Flags packed as 32-bit uint.
};
/// Constant defining the maximum view instance count that is supported.
constexpr uint32 MaxViewInstanceCount = 6;
/// Specifies graphic pipeline view instancing state.
///
/// Each array below has one slot per view instance, up to @ref MaxViewInstanceCount.
/// NOTE(review): presumably only the first viewInstanceCount slots of each array are consumed - confirm.
struct ViewInstancingDescriptor
{
    uint32 viewInstanceCount; ///< The view instance count of the graphic pipeline
    uint32 viewId[MaxViewInstanceCount]; ///< The view instance ids.
    uint32 renderTargetArrayIdx[MaxViewInstanceCount]; ///< The instance render target array index, can be
                                                       ///  used in hardware accelerated stereo rendering.
    uint16 viewportArrayIdx[MaxViewInstanceCount]; ///< The instance viewport array index, can be
                                                   ///  used in hardware accelerated stereo rendering.
    bool enableMasking; ///< Indicate whether instance masking is enabled.
};
/// Specifies the input parameters for the MSAA coverage out feature. MSAA coverage out is used in conjunction with a
/// single sampled color image. This feature exports a mask indicating which samples would have been used if the
/// image had been multi-sampled. The mask is exported to the specified channel of the MRT pointing to the rendered
/// image. That is, the MRT must be an active bound render target. This MSAA mask data can then be post-processed.
struct MsaaCoverageOutDescriptor
{
    union
    {
        struct
        {
            uint32 enable : 1; ///< Set to true to enable render target channel output
            uint32 numSamples : 4; ///< Number of samples to export
            uint32 mrt : 3; ///< Which MRT to export to.
            uint32 channel : 2; ///< Which channel to export to (x = 0, y = 1, z = 2, w = 3)
            uint32 reserved : 22; ///< Remaining bits of the 32-bit flag word.
        };
        uint32 u32All; ///< All of the fields above viewed as a single 32-bit value.
    } flags;
};
/// Specifies properties about an indirect function belonging to a compute @ref IPipeline object. Part of the input
/// structure to IDevice::CreateComputePipeline().
struct ComputePipelineIndirectFuncInfo
{
    const char* pSymbolName; ///< ELF Symbol name for the associated function. Must not be null.
    gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during
                         ///  pipeline creation.
};
/// Specifies properties for creation of a compute @ref IPipeline object. Input structure to
/// IDevice::CreateComputePipeline().
struct ComputePipelineCreateInfo
{
    PipelineCreateFlags flags; ///< Flags controlling pipeline creation.
    const void* pPipelineBinary; ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI
                                 ///  interface. The Pipeline ELF contains pre-compiled shaders,
                                 ///  register values, and additional metadata.
    size_t pipelineBinarySize; ///< Size of Pipeline ELF binary in bytes.
    uint32 maxFunctionCallDepth; ///< Maximum depth for indirect function calls. Not used for a new
                                 ///  path ray-tracing pipeline as the compiler has pre-calculated
                                 ///  stack requirements.
    bool disablePartialDispatchPreemption; ///< Prevents scenarios where a subset of the dispatched thread groups are
                                           ///  preempted and the remaining thread groups run to completion. This
                                           ///  can occur when thread group granularity preemption is available and
                                           ///  instruction level (CWSR) is not. This setting is useful for allowing
                                           ///  dispatches with interdependent thread groups.
    DispatchInterleaveSize interleaveSize; ///< Controls how many thread groups are sent to one SE before switching to
                                           ///  the next one.
    /// PAL expects a fixed 3D thread group size for each compute pipeline but the HSA ABI supports dynamic group sizes.
    /// If this pipeline's ELF binary metadata doesn't specify a fixed thread group size, this should be used to force
    /// a particular thread group size. If this extent is set to all zeros PAL will use the metadata's group size.
    /// This field is not supported on PAL ABI ELFs, it should be set to all zeros.
    Extent3d threadsPerGroup;
    TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle
                                   ///  issuing of low priority waves when it detects too many higher priority waves are
                                   ///  failing to schedule due to resource constraints.
    const char* pKernelName; ///< When creating a pipeline from an HSA ELF binary that contains multiple kernels,
                             ///  names the one kernel to create the pipeline from. Null means the ELF binary
                             ///  contains only one kernel.
};
/// Specifies information about the viewport behavior of an assembled graphics pipeline. Part of the input
/// structure @ref GraphicsPipelineCreateInfo.
struct ViewportInfo
{
    bool depthClipNearEnable; ///< Enable clipping based on Near Z coordinate.
    bool depthClipFarEnable; ///< Enable clipping based on Far Z coordinate.
    DepthRange depthRange; ///< Specifies Z dimensions of screen space (i.e., post viewport transform:
                           ///  0 to 1 or -1 to 1).
};
/// Specifies the edge rule used for rasterization (see RasterizerState::edgeRule).
enum class EdgeRuleMode : uint32
{
    D3dCompliant = 0x0, ///< Use rasterization edge-rules which comply with the D3D spec.
    OpenGlDefault = 0x1, ///< Use rasterization edge-rules compatible with the default OpenGL driver.
};
/// Specifies rasterizer state in the properties for creation of a graphics pipeline. Part of the input structure
/// @ref GraphicsPipelineCreateInfo.
struct RasterizerState
{
    PointOrigin pointCoordOrigin; ///< Controls texture coordinate orientation for point sprites.
    bool expandLineWidth; ///< If true, line primitives will have their width expanded by 1/cos(a)
                          ///  where a is the minimum angle from horizontal or vertical.
                          ///  This can be used in conjunction with PS patching for a client to
                          ///  implement line antialiasing.
    ShadeMode shadeMode; ///< Specifies shading mode, Gouraud or Flat
    bool rasterizeLastLinePixel; ///< Specifies whether to draw last pixel in a line.
    bool outOfOrderPrimsEnable; ///< Enables out-of-order primitive rasterization. PAL silently
                                ///  ignores this if it is unsupported in hardware.
    bool perpLineEndCapsEnable; ///< Forces the use of perpendicular line end caps as opposed to
                                ///  axis-aligned line end caps during line rasterization.
    BinningOverride binningOverride; ///< Binning setting for this pipeline.
    DepthClampMode depthClampMode; ///< Depth clamping behavior
    union
    {
        struct
        {
            uint8 clipDistMaskValid : 1; ///< Whether or not @ref clipDistMask, below, is valid.
            uint8 cullDistMaskValid : 1; ///< Whether or not @ref cullDistMask, below, is valid.
            uint8 reserved : 6; ///< Remaining bits of the 8-bit flag byte.
        };
        uint8 u8All; ///< All the flags as a single value.
    } flags;
    uint8 cullDistMask; ///< Mask of which cullDistance exports to leave enabled.
    uint8 clipDistMask; ///< Mask of which clipDistance exports to leave enabled.
    bool dx10DiamondTestDisable; ///< Disable DX10 diamond test during line rasterization.
    EdgeRuleMode edgeRule; ///< Rasterization edge rule to use (see @ref EdgeRuleMode).
};
/// Specifies per-MRT color target info in the color target state (see ColorTargetState::target).
struct ColorTargetInfo
{
    SwizzledFormat swizzledFormat; ///< Color target format and channel swizzle. Set the format to invalid
                                   ///  if no color target will be bound at this slot.
    uint8 channelWriteMask; ///< Color target write mask. Bit 0 controls the red channel, bit 1 is
                            ///  green, bit 2 is blue, and bit 3 is alpha.
    bool forceAlphaToOne; ///< Treat alpha as one regardless of the shader output. Ignored unless
                          ///  supportAlphaToOne is set in DeviceProperties.
};
/// Specifies color target state in the properties for creation of a graphics pipeline. Part of the input structure
/// @ref GraphicsPipelineCreateInfo.
struct ColorTargetState
{
    bool alphaToCoverageEnable; ///< Enable alpha to coverage.
    bool dualSourceBlendEnable; ///< Blend state bound at draw time will use a dual source blend mode.
    LogicOp logicOp; ///< Logic operation to perform.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 904
    bool uavExportSingleDraw; ///< When UAV export is enabled, acts as a hint that only a single draw
                              ///  is done on a color target with this or subsequent pipelines before
                              ///  a barrier. Improves performance by allowing pipelines to overlap.
                              ///  Only present for client interface major versions below 904.
#endif
    ColorTargetInfo target[MaxColorTargets]; ///< Per-MRT color target info.
};
/// Specifies properties for creation of a graphics @ref IPipeline object. Input structure to
/// IDevice::CreateGraphicsPipeline().
struct GraphicsPipelineCreateInfo
{
    PipelineCreateFlags flags; ///< Flags controlling pipeline creation.
    const void* pPipelineBinary; ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI
                                 ///  interface. The Pipeline ELF contains pre-compiled shaders,
                                 ///  register values, and additional metadata.
    size_t pipelineBinarySize; ///< Size of Pipeline ELF binary in bytes.
    const IShaderLibrary** ppShaderLibraries; ///< An array of graphics @ref IShaderLibrary objects. pPipelineBinary
                                              ///  and ppShaderLibraries can't be valid at the same time.
    size_t numShaderLibraries; ///< Number of graphics shaderLibrary objects in ppShaderLibraries.
    bool useLateAllocVsLimit; ///< If set, use the specified lateAllocVsLimit instead of PAL internally
                              ///  determining the limit.
    uint32 lateAllocVsLimit; ///< The number of VS waves that can be in flight without having param
                             ///  cache and position buffer space. If useLateAllocVsLimit flag is set,
                             ///  PAL will use this limit instead of the PAL-specified limit.
    bool useLateAllocGsLimit; ///< If set, use the specified lateAllocGsLimit instead of PAL internally
                              ///  determining the limit.
    uint32 lateAllocGsLimit; ///< Controls GS LateAlloc val (for pos/prim allocations NOT param cache)
                             ///  on NGG pipelines. Can be no more than 127.
    struct
    {
        struct
        {
            PrimitiveType primitiveType; ///< Basic primitive category: points, line, triangles, patches.
            bool topologyIsPolygon; ///< Indicates that triangle primitives are combined to represent more
                                    ///  complex polygons. Only valid for triangle primitive types.
            uint32 patchControlPoints; ///< Number of control points per patch. Only required if primitiveType
                                       ///  is PrimitiveType::Patch.
        } topologyInfo; ///< Various information about the primitive topology that will be used with this pipeline.
                        ///  All of this info must be consistent with the full topology specified by
                        ///  ICmdBuffer::SetPrimitiveTopology() when drawing with this pipeline bound.
        /// Number of vertex buffer slots which are accessed by this pipeline. Behavior is undefined if the pipeline
        /// tries to access a vertex buffer slot outside the range [0, vertexBufferCount). It is generally advisable
        /// to make this the minimum value possible because that reduces the number of vertex buffer slots PAL has to
        /// maintain for this pipeline when recording command buffers.
        uint32 vertexBufferCount;
    } iaState; ///< Input assembler state.
    RasterizerState rsState; ///< Rasterizer state.
    ColorTargetState cbState; ///< Color target state.
    ViewInstancingDescriptor viewInstancingDesc; ///< Descriptor describes view instancing state
                                                 ///  of the graphics pipeline
    MsaaCoverageOutDescriptor coverageOutDesc; ///< Descriptor describes input parameters for MSAA coverage out.
    ViewportInfo viewportInfo; ///< Viewport info.
    DispatchInterleaveSize taskInterleaveSize; ///< Ignored for pipelines without a task shader. For pipelines with
                                               ///  a task shader, controls how many thread groups are sent to one
                                               ///  SE before switching to the next one.
    LdsPsGroupSizeOverride ldsPsGroupSizeOverride; ///< Whether to override ldsPsGroupSize setting for pipeline.
    TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle
                                   ///  issuing of low priority waves when it detects too many higher priority waves are
                                   ///  failing to schedule due to resource constraints.
    bool noForceReZ; ///< Disables the ability for PAL to force ReZ modes outside of what was chosen by
                     ///  the compiler for this pipeline.
};
/// The graphic pipeline view instancing information. This is used to determine if hardware accelerated stereo rendering
/// can be enabled for a graphic pipeline.
struct GraphicPipelineViewInstancingInfo
{
    union
    {
        struct
        {
            uint32 shaderUseViewId : 1; ///< If any shader in pipeline uses view id.
            uint32 gsExportRendertargetArrayIndex : 1; ///< If gs exports render target array index,
                                                       ///  must be 0 if there is no gs.
            uint32 gsExportViewportArrayIndex : 1; ///< If gs exports viewport array index,
                                                   ///  must be 0 if there is no gs.
            uint32 reserved : 29; ///< Reserved for future use.
        };
        uint32 apiShaderFlags; ///< All of the flags above viewed as a single 32-bit value.
    };
    const ViewInstancingDescriptor* pViewInstancingDesc; ///< View Instancing descriptor
};
/// Reports properties of a compiled pipeline. This includes hashes for the pipeline and shaders that the client can
/// use to correlate PAL pipeline/shader dumps with corresponding API-level pipelines/shaders.
struct PipelineInfo
{
    const char* pName; ///< Non-owning pointer to a null-terminated string containing human-readable
                       ///  name of this pipeline. nullptr indicates no name was provided.
    PipelineHash internalPipelineHash; ///< 128-bit identifier extracted from this pipeline's ELF binary, composed of
                                       ///  the state the compiler decided was appropriate to identify the compiled
                                       ///  shaders. The lower 64 bits are "stable"; the upper 64 bits are "unique".
    struct
    {
        ShaderHash hash; ///< Unique 128-bit identifier for this shader. 0 indicates there is no shader bound for
                         ///  the corresponding shader stage.
    } shader[NumShaderTypes]; ///< Array of per-shader pipeline properties.
    union
    {
        struct
        {
            uint32 hsaAbi : 1; ///< This pipeline uses the HSA ABI (i.e. bind arguments not user-data)
            uint32 usesCps : 1; ///< This pipeline uses continuations passing shaders (CPS). In an archive pipeline,
                                ///  this bit is set iff it is set in any constituent pipeline.
            uint32 cpsGlobal : 1; ///< If using continuations passing shaders (CPS), stack is in global rather than
                                  ///  scratch. In an archive pipeline, this bit is set iff it is set in any
                                  ///  constituent pipeline.
            uint32 reserved : 29; ///< Reserved for future use.
        };
        uint32 u32All; ///< All flags combined as a single uint32.
    } flags; ///< Pipeline properties.
    struct
    {
        union
        {
            struct
            {
                uint32 perSampleShading : 1; ///< Shader instructions want per-sample execution.
                uint32 usesSampleMask : 1; ///< Shader is using sample mask.
                uint32 enablePops : 1; ///< Primitive order pixel shader is enabled.
                uint32 reserved : 29; ///< Reserved for future use.
            };
            uint32 u32All; ///< All flags combined as a single uint32.
        } flags;
    } ps; ///< Pixel shader properties.
    uint64 resourceMappingHash; ///< 64-bit hash of the resource mapping used when compiling the pipeline,
                                ///  if available (0 otherwise).
    /// The GPU address and size in bytes of the traceRays table, or { 0, 0 } if none.
    /// The format of the table is private to the compiler.
    /// Compiled code requires a buffer descriptor for the traceRays table to be passed in to the launch kernel
    /// in a particular place.
    gpusize traceRaysTable;
    gpusize traceRaysTableSize;
    /// Pointer to the traceRays table for debugging purposes, or nullptr if none.
    const char* pTraceRaysTable;
    /// Pointer to and size in bytes of the shader identifiers table, or { nullptr, 0 } if none.
    /// The table has an entry for each export and then each hit group specified when the pipeline was compiled.
    /// Each entry has a pointer to the shader or hit group name in the same block of data (always 64 bits even
    /// on a 32-bit host), then the 32 byte shader identifier whose format is agreed between the compiler and
    /// GPURT's library code, for a total of 40 bytes per entry.
    const void* pShaderIdentifiers;
    size_t shaderIdentifiersSize;
    uint32 unifiedRgsNameHash; ///< 32-bit hash of unified RGS name, 0 otherwise
};
/// Describes a 3D arrangement of threads or thread groups used as part of a compute shader dispatch.
///
/// The struct sits halfway between Extent3d and Offset3d: depending on the context it may be read as an offset or as
/// an extent.  Its meaning is tied to the concept of 3D thread or thread group grids rather than to generic notions
/// such as "extent" or "offset".  Whether the values count threads or thread groups is likewise context specific.
struct DispatchDims
{
    uint32 x; ///< Threads or thread groups in the X dimension.
    uint32 y; ///< Threads or thread groups in the Y dimension.
    uint32 z; ///< Threads or thread groups in the Z dimension.

    /// Computes the volume of this 3D arrangement of threads or thread groups.
    ///
    /// The product is evaluated in uint32 arithmetic and no overflow check is performed.
    ///
    /// @returns the total number of threads or thread groups this struct represents.
    uint32 Flatten() const
    {
        const uint32 xy = x * y;
        return xy * z;
    }
};
// Some code casts a DispatchDims directly to an array of three uint32s, so the struct must be exactly that large
// (i.e. it must not contain any padding or extra members).
static_assert((3 * sizeof(uint32)) == sizeof(DispatchDims), "DispatchDims not castable to uint32*");
/// Adds two DispatchDims dimension by dimension.
///
/// @param [in] l The left-hand argument.
/// @param [in] r The right-hand argument.
///
/// @returns A new DispatchDims whose x, y and z are the sums of the matching members of 'l' and 'r'.
inline DispatchDims operator+(DispatchDims l, DispatchDims r)
{
    // 'l' was passed by value, so it can be accumulated into and returned without touching the caller's object.
    l.x += r.x;
    l.y += r.y;
    l.z += r.z;
    return l;
}
/// Adds one DispatchDims into another, dimension by dimension.
///
/// @param [in,out] l The left-hand argument; receives the result.
/// @param [in]     r The right-hand argument.
///
/// @returns A reference to 'l' after it is updated to the sum of 'l' and 'r'.
inline DispatchDims& operator+=(DispatchDims& l, DispatchDims r)
{
    const DispatchDims sum = l + r;
    l = sum;
    return l;
}
/// Multiplies two DispatchDims dimension by dimension.
///
/// @param [in] l The left-hand argument.
/// @param [in] r The right-hand argument.
///
/// @returns A new DispatchDims whose x, y and z are the products of the matching members of 'l' and 'r'.
inline DispatchDims operator*(DispatchDims l, DispatchDims r)
{
    DispatchDims product;
    product.x = l.x * r.x;
    product.y = l.y * r.y;
    product.z = l.z * r.z;
    return product;
}
/// Multiplies one DispatchDims into another, dimension by dimension.
///
/// @param [in,out] l The left-hand argument; receives the result.
/// @param [in]     r The right-hand argument.
///
/// @returns A reference to 'l' after it is updated to the product of 'l' and 'r'.
inline DispatchDims& operator*=(DispatchDims& l, DispatchDims r)
{
    const DispatchDims scaled = l * r;
    l = scaled;
    return l;
}
/// Used to represent API level shader stage.
///
/// Each value is a single bit whose position is the numeric value of the matching ShaderType enumerator, so several
/// stages can be OR'd together into one mask (see ShaderStats::shaderStageMask).
enum ShaderStageFlagBits : uint32
{
ApiShaderStageCompute = (1u << static_cast<uint32>(ShaderType::Compute)), ///< Bit for ShaderType::Compute.
ApiShaderStageTask = (1u << static_cast<uint32>(ShaderType::Task)), ///< Bit for ShaderType::Task.
ApiShaderStageVertex = (1u << static_cast<uint32>(ShaderType::Vertex)), ///< Bit for ShaderType::Vertex.
ApiShaderStageHull = (1u << static_cast<uint32>(ShaderType::Hull)), ///< Bit for ShaderType::Hull.
ApiShaderStageDomain = (1u << static_cast<uint32>(ShaderType::Domain)), ///< Bit for ShaderType::Domain.
ApiShaderStageGeometry = (1u << static_cast<uint32>(ShaderType::Geometry)), ///< Bit for ShaderType::Geometry.
ApiShaderStageMesh = (1u << static_cast<uint32>(ShaderType::Mesh)), ///< Bit for ShaderType::Mesh.
ApiShaderStagePixel = (1u << static_cast<uint32>(ShaderType::Pixel)), ///< Bit for ShaderType::Pixel.
};
/// Reports shader stats. Multiple bits set in the shader stage mask indicate that multiple shaders have been combined
/// due to HW support. The same information will be repeated for both the constituent shaders in this case.
///
/// @see IPipeline::GetShaderStats()
struct ShaderStats
{
uint32 shaderStageMask; ///< Indicates the stages of the pipeline this shader is
/// used for. If multiple bits are set, it implies
/// shaders were merged. See @ref ShaderStageFlagBits.
CommonShaderStats common; ///< The shader compilation parameters for this shader.
/// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum
/// of any architectural restriction and any client-requested limit intended to increase the number of waves in
/// flight.
uint32 numAvailableVgprs;
/// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum
/// of any architectural restriction and any client-requested limit intended to increase the number of waves in
/// flight.
uint32 numAvailableSgprs;
size_t isaSizeInBytes; ///< Size in bytes of the shader ISA disassembly for this shader. IPipeline::GetShaderStats()
/// reports 0 here unless it was asked to compute the disassembly size.
ShaderHash palShaderHash; ///< Internal hash of the shader compilation data used by PAL.
union
{
struct
{
uint32 writesUAV : 1; ///< This shader performs writes to UAVs.
uint32 writesDepth : 1; ///< Indicates explicit depth writes performed by the shader stage.
uint32 streamOut : 1; ///< The shader performs stream out of shader generated data.
uint32 reserved : 29; ///< Reserved for future use.
};
uint32 u32All; ///< All flags combined as a single uint32.
} shaderOperations; ///< Flags depicting shader operations.
struct
{
DispatchDims numThreadsPerGroup; ///< Number of compute threads per thread group in X, Y, and Z dimensions.
} cs; ///< Parameters specific to compute shader only.
union
{
struct
{
uint8 copyShaderPresent : 1; ///< Indicates that the copy shader data is valid.
uint8 reserved : 7; ///< Reserved for future use.
};
uint8 u8All; ///< All the flags as a single value.
} flags; ///< Flags related to this shader data.
CommonShaderStats copyShader; ///< This data is valid only when the copyShaderPresent flag above is set.
};
/**
***********************************************************************************************************************
* @interface IPipeline
* @brief Monolithic object containing all shaders and a large amount of "shader adjacent" state. Separate concrete
* implementations will support compute or graphics pipelines.
*
* The constructor and destructor are protected: clients obtain pipelines from the create/load methods listed below and
* release them through IDestroyable::Destroy() rather than with new/delete. Copying is disabled.
*
* @see IDevice::CreateComputePipeline()
* @see IDevice::CreateGraphicsPipeline()
* @see IDevice::LoadPipeline()
***********************************************************************************************************************
*/
class IPipeline : public IDestroyable
{
public:
/// Returns PAL-computed properties of this pipeline and its corresponding shaders.
///
/// @returns Property structure describing this pipeline.
virtual const PipelineInfo& GetInfo() const = 0;
/// Returns a list of GPU memory allocations used by this pipeline.
///
/// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value
/// reports the number of GPU memory allocations.
/// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it
/// will reflect the number of allocations that make up this pipeline. If
/// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number
/// of entries in the pAllocInfoList array. On output, pNumEntries reflects the
/// number of entries in pAllocInfoList that are valid.
/// @returns Success if the allocation info was successfully written to the buffer.
/// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed.
/// + ErrorInvalidPointer if pNumEntries is nullptr.
virtual Result QueryAllocationInfo(
size_t* pNumEntries,
GpuMemSubAllocInfo* const pAllocInfoList) const = 0;
/// Gives the client access to the resource ID used for internal Pal events.
/// EX: Resource Create, Resource Bind, Resource Destroy.
///
/// @returns The Resource ID.
virtual const void* GetResourceId() const = 0;
/// Obtains the binary code object for this pipeline.
///
/// @param [in, out] pSize Represents the size of the pipeline ELF.
///
/// @param [out] pBuffer If non-null, the pipeline ELF is written in the buffer. If null, the size required
/// for the pipeline ELF is given out in the location pSize.
///
/// @note pSize is a uint32 here, whereas GetShaderCode() and GetPerformanceData() take a size_t.
///
/// @returns Success if the pipeline binary was fetched successfully.
/// +ErrorUnavailable if the pipeline binary was not fetched successfully.
virtual Result GetCodeObject(
uint32* pSize,
void* pBuffer) const = 0;
/// Obtains the pointer of code object with ELF format according to the shader type. Returned ELF object is not
/// guaranteed to be unique with different shader type, because a single code object can contain multiple shaders.
///
/// @param [in] shaderType The shader stage for which the code object is requested.
/// @param [out] pSize The size of the ELF binary.
///
/// @returns The pointer of ELF binary which contains requested shader stage.
virtual const void* GetCodeObjectWithShaderType(
ShaderType shaderType,
size_t* pSize) const = 0;
/// Obtains the shader pre and post compilation stats/params for the specified shader stage.
///
/// @param [in] shaderType The shader stage for which the stats are requested.
///
/// @param [out] pShaderStats Pointer to the ShaderStats structure which will be filled with the shader stats for
/// the shader stage mentioned in shaderType. This cannot be nullptr.
/// @param [in] getDisassemblySize If set to true performs disassembly on the shader binary code and reports the
/// size of the disassembly string in ShaderStats::isaSizeInBytes. Else reports 0.
/// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size.
/// +ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error
/// occurred.
virtual Result GetShaderStats(
ShaderType shaderType,
ShaderStats* pShaderStats,
bool getDisassemblySize) const = 0;
/// Obtains the compiled shader ISA code for the shader stage specified.
///
/// @param [in] shaderType The shader stage for which the shader cache entry is requested.
///
/// @param [in, out] pSize Represents the size of the shader ISA code.
///
/// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required
/// for the shader ISA is given out in the location pSize.
///
/// @returns Success if the shader ISA code was fetched successfully.
/// +ErrorUnavailable if the shader ISA code was not fetched successfully.
virtual Result GetShaderCode(
ShaderType shaderType,
size_t* pSize,
void* pBuffer) const = 0;
/// Obtains the generated performance data for the shader stage specified.
///
/// @param [in] hardwareStage The hardware stage of the shader for which the performance data is requested.
/// @param [in, out] pSize Represents the size of the performance data.
/// @param [out] pBuffer If non-null, the performance data is written in the buffer. If null, the size
/// required for the performance data is given out in the location pSize.
///
/// @note Unlike the other query methods on this interface, this method is not declared const.
///
/// @returns Success if the performance data was fetched successfully.
/// +ErrorUnavailable if the performance data was not fetched successfully.
virtual Result GetPerformanceData(
Util::Abi::HardwareStage hardwareStage,
size_t* pSize,
void* pBuffer) = 0;
/// Notifies PAL that this pipeline may make indirect function calls to any function contained within any of the
/// specified @ref IShaderLibrary objects. This gives PAL a chance to perform any late linking steps required for
/// valid execution of the possible function calls (this could include adjusting hardware resources such as GPRs
/// or LDS space for the pipeline).
///
/// This may be called multiple times on the same pipeline object. Subsequent calls do not invalidate the result
/// of previous calls.
///
/// This must be called prior to binding this pipeline to a command buffer which will make function calls into any
/// shader function contained within any of the specified libraries. Failure to comply is an error and will result
/// in undefined behavior.
///
/// Currently only supported on compute pipelines.
///
/// @param [in] ppLibraryList List of @ref IShaderLibrary objects to link with.
/// @param [in] libraryCount Number of valid library objects in the ppLibraryList array.
///
/// @returns Success if the operation is successful. Other return codes may include:
/// + ErrorUnavailable if called on a graphics pipeline.
/// + ErrorBadPipelineData if any of the libraries in ppLibraryList are not compatible with this pipeline.
/// Reasons for incompatibility include (but are not limited to) different user-data mappings, different
/// wavefront sizes, and other reasons.
virtual Result LinkWithLibraries(
const IShaderLibrary*const* ppLibraryList,
uint32 libraryCount) = 0;
/// Sets the stack size for indirect function calls made by this pipeline. This may be smaller than or equal to the
/// stack size already determined during pipeline creation or during an earlier call to LinkWithLibraries() because
/// the client has access to more information about which functions contained in those libraries (or in the pipeline
/// itself) are actually going to be called.
///
/// Note that a future call to LinkWithLibraries() will invalidate this value and this should
/// be called again.
///
/// @param [in] stackSizeInBytes Client-specified stack size, in bytes.
virtual void SetStackSizeInBytes(
uint32 stackSizeInBytes) = 0;
/// Retrieve the stack sizes managed by compiler, including the frontend stack and the backend stack.
///
/// @param [out] pSizes To be filled with both the frontend stack size and the backend stack size, in bytes.
///
/// @returns Success.
virtual Result GetStackSizes(
CompilerStackSizes* pSizes) const = 0;
/// Returns the API shader type to hardware stage mapping for the pipeline.
///
/// @returns The appropriate mapping for this pipeline.
virtual Util::Abi::ApiHwShaderMapping ApiHwShaderMapping() const = 0;
/// Given the zero-based position of a kernel argument, return a pointer to that argument's metadata.
///
/// @note Only compute pipelines using the HSA ABI have kernel arguments.
///
/// @param [in] index The zero-based position of the kernel argument to query.
///
/// @returns A pointer to the kernel argument's metadata, or null if this pipeline doesn't have this argument.
virtual const Util::HsaAbi::KernelArgument* GetKernelArgument(uint32 index) const = 0;
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data. This is nullptr until SetClientData() has been called.
void* GetClientData() const { return m_pClientData; }
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
/// Get the array of underlying pipelines that this pipeline contains. For a normal non-multi-pipeline,
/// this returns a single-entry array pointing to the same IPipeline. For a multi-pipeline compiled in
/// dynamic launch mode, this returns an empty array. The contents of the returned array remain valid
/// until the IPipeline is destroyed.
///
/// @returns The array of underlying pipelines.
virtual Util::Span<const IPipeline* const> GetPipelines() const = 0;
/// Get the array of underlying shader libraries that this pipeline contains. For a normal non-multi-pipeline,
/// this returns the empty array. The contents of the returned array remain valid until the IPipeline is
/// destroyed.
///
/// The implementation provided here returns an empty span; unlike GetPipelines() it is not pure virtual.
///
/// @returns The array of underlying shader libraries.
virtual Util::Span<const IShaderLibrary* const> GetLibraries() const { return {}; }
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method.
IPipeline() : m_pClientData(nullptr) {}
/// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
/// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
/// object on their own.
virtual ~IPipeline() { }
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
// Copy construction and copy assignment are disallowed.
IPipeline(const IPipeline&) = delete;
IPipeline& operator=(const IPipeline&) = delete;
};
} // Pal
@@ -0,0 +1,645 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palPlatform.h
* @brief Defines the Platform Abstraction Library (PAL) IPlatform interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palSysMemory.h"
#include "palMemTrackerImpl.h"
#include "palDestroyable.h"
#include "palDeveloperHooks.h"
// Forward declarations of the DevDriver types that the IPlatform interface hands out by pointer.
namespace DevDriver
{
    class DevDriverServer;
    class SettingsRpcService;

    namespace EventProtocol
    {
        class EventServer;
    } // EventProtocol
} // DevDriver
// Forward declaration of the older settings service type, which is only exposed to clients built against an
// interface version below 890.
#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890)
namespace SettingsRpcService
{
    class SettingsService;
} // SettingsRpcService
#endif // PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890
// Only compiled when PAL_BUILD_RDF is set; IPlatform::GetTraceSession() is guarded by the same macro.
#if PAL_BUILD_RDF
// GpuUtil forward declarations.
namespace GpuUtil
{
class TraceSession;
}
#endif // PAL_BUILD_RDF
// Util forward declarations, only compiled when PAL_ENABLE_LOGGING is set.
#if PAL_ENABLE_LOGGING
namespace Util
{
struct DbgLoggerFileSettings;
}
#endif // PAL_ENABLE_LOGGING
namespace Pal
{
// Types defined elsewhere that this header only needs to name.
enum class PalEvent : uint32;
struct PalPlatformSettings;
class IScreen;
class IDevice;
/// Upper bound on the number of Devices possibly attached to a system.
constexpr uint32 MaxDevices = 16;

/// Upper bound on the number of Screens possibly attached to a single Device.
constexpr uint32 MaxScreensPerDevice = 6;

/// Upper bound on the number of Screens possibly attached to a system: every Device with its full set of Screens.
constexpr uint32 MaxScreens = MaxDevices * MaxScreensPerDevice;

/// Maximum number of modes per Screen.
/// NOTE(review): this constant was undocumented; the meaning is inferred from its name - confirm.
constexpr uint32 MaxModePerScreen = 64;
/// 32-bit PAL version identifier, stored as two 16-bit fields (major followed by minor).
///
/// Version number of the PAL library. Major version is bumped on every promotion from stg/pal to stg/pal_prm. Minor
/// version is bumped when a change is cherry-picked to stg/pal_prm.
///
/// @see PlatformProperties::palVersion
struct Version
{
uint16 major; ///< Major version number.
uint16 minor; ///< Minor version number.
};
/// Union defining the DevDriver GPU ID layout.
/// This is specifically used by DevDriver across multiple tools/driver and should not be changed.
///
/// IPlatform::GetPciId() describes the ID as (BusID << 16) | (DeviceID << 8) | FunctionID. The bit-fields below are
/// declared lowest field first to match that description.
/// NOTE(review): bit-field allocation order is implementation-defined, so the match between the named fields and
/// u32All depends on the target compilers allocating from the least significant bit - confirm for new toolchains.
union PciId
{
struct
{
uint32 functionId : 8; ///< PCI function number in the system for this GPU.
uint32 deviceId : 8; ///< PCI device number in the system for this GPU.
uint32 busId : 8; ///< PCI bus number in the system for this GPU.
uint32 reserved : 8; ///< Reserved for future use.
};
uint32 u32All; ///< Fields packed as 32-bit uint.
};
/// Reports capabilities and general properties of this instantiation of the PAL library.
///
/// This covers any property that is platform-wide as opposed to being tied to a particular device in the system.
///
/// @see IPlatform::GetProperties
struct PlatformProperties
{
Version palVersion; ///< Version number of the PAL library. Note that this is distinct from the interface version.
/// It will be regularly updated as described in @ref Version.
/// Platform capability flags. The union is anonymous, so the individual bits and u32All are accessed directly as
/// members of PlatformProperties.
union
{
struct
{
uint32 supportNonSwapChainPresents : 1; ///< If set, non-swapchain presents are supported.
uint32 supportBlockIfFlipping : 1; ///< If set, IQueue::Submit can protect against command buffers
/// that write to GPU memory queued for a flip present.
uint32 explicitPresentModes : 1; ///< If set, the PresentMode enums specified during direct and swap
/// chain presents explicitly determine the presentation method.
/// Additionally, the client must enumerate IScreens and use them
/// to explicitly manage fullscreen ownership. Otherwise, the
/// present modes are suggestions and fullscreen ownership is
/// managed internally by PAL.
uint32 reserved : 29; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
};
};
/// The client that PAL may query an application profile for. The order of the enumerators is the same as
/// SHARED_AP_AREA in the KMD escape interface, so entries must not be reordered independently of it.
///
/// @see IPlatform::QueryRawApplicationProfile()
enum class ApplicationProfileClient : uint32
{
Uninitialized = 0,
Dxx,
Udx,
Cfx,
Ogl,
User3D,
Ocl,
Mmd,
Pplib,
Dal,
Chill,
Spp,
Boost,
DeLag,
Ris,
TurboSync,
DutyCycleScaling,
ProBoost,
RisWindowed,
FreeMux,
FsrOvr,
Count ///< Number of enumerators preceding this one.
};
/// Describes a primary surface view.
///
/// @see IPlatform::GetPrimaryLayout()
struct PrimaryViewInfo
{
Rect rect; ///< Rectangle defining one portion of a primary surface layout.
uint32 numIndices; ///< The size of the gpuIndex array. The array has storage for MaxDevices entries.
uint32 gpuIndex[MaxDevices]; ///< The devices in a linked adapter chain that can use this view.
};
/// Specifies output arguments for IPlatform::GetPrimaryLayout(), returning information about the layout of the primary
/// surface.
///
/// @see IPlatform::GetPrimaryLayout()
struct GetPrimaryLayoutOutput
{
uint32 numViews; ///< The number of views in the pViewInfoList array.
PrimaryViewInfo* pViewInfoList; ///< The primary surface is composed of these views.
union
{
struct
{
uint32 disablePartialCopy : 1; ///< If this flag is not set, the client can transfer the specific views of
/// primary surface to peer GPUs. Otherwise, the client must transfer the
/// whole primary surface to peer GPUs.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Specifies primary surface layout flags.
};
/// Specifies TurboSync control mode.
///
/// @see TurboSyncControlInput
enum class TurboSyncControlMode : uint32
{
Disable = 0, ///< Disable TurboSync
Enable = 1, ///< Enable TurboSync
UpdateAllocations = 2, ///< Update allocations only, without disabling or enabling TurboSync
Register = 3, ///< Register the current platform as TurboSync requested platform, doesn't actually
/// activate TurboSync.
Count ///< Number of control modes listed above.
};
constexpr uint32 TurboSyncMaxSurfaces = 2; ///< Specifies maximum number of surfaces in a private TurboSync swapchain
/// Input argument for IPlatform::TurboSyncControl. TurboSync is a feature that enables an app to render at higher than
/// V-Sync frame rates while still being tearing-free. It creates a private swapchain and copies the application's back
/// buffer to the primary in this private swapchain when the application is flipping. KMD controls the flipping of the
/// private swapchain to screen.
struct TurboSyncControlInput
{
TurboSyncControlMode mode; ///< Specifies the TurboSync control mode
uint32 vidPnSourceId; ///< The vidPnSourceId that the call targets
/// GpuMemory of the primaries in private swapchain, per-gpu. This is indexed by the device indices enumerated by
/// the platform. Pal forwards the allocation handles (if IGpuMemory ptr is not null) to Kmd without validation.
/// The second dimension holds up to TurboSyncMaxSurfaces primaries for each device.
const IGpuMemory* pPrimaryMemoryArray[MaxDevices][TurboSyncMaxSurfaces];
};
/**
************************************************************************************************************************
* @interface IPlatform
* @brief Interface representing a client-configurable context of the PAL platform.
*
* This is the root of all client interaction with PAL. Each IPlatform contains a set of the IDevice's and IScreens
* found in the system.
*
* + Creation of IDevice and IScreen objects.
* + Installation of memory management callbacks.
* + Query application profiles from the system.
************************************************************************************************************************
*/
class IPlatform : public IDestroyable
{
public:
/// Enumerates a list of available Devices.
///
/// This function creates a set of @ref IDevice objects corresponding to the devices attached to the system.
/// CreatePlatform() must be called before this function is called.
///
/// This function may be called multiple times during the lifetime of the PAL lib, in which case all previous
/// @ref IDevice and @ref IScreen objects are automatically destroyed. The client is responsible for
/// destroying all objects attached to the existing @ref IDevice objects before re-calling this function.
/// Re-enumerating Devices is required if ErrorDeviceLost is ever returned by PAL, as this may indicate a device
/// has been physically removed from the system.
///
/// @note Before IPlatform::Destroy can be called, all devices returned by IPlatform::EnumerateDevices() must be
/// destroyed.
///
/// @param [out] pDeviceCount Specifies the number of devices available in the system. This is the number of valid
/// entries in pDevices[]. Must not be null.
/// @param [out] pDevices Array to be populated with a device object pointer for each device available in the
/// system. The first *pDeviceCount entries are valid. Must not be null.
///
/// @returns Success if all Devices were successfully enumerated in pDevices[]. Otherwise, one of the following
/// error codes may be returned:
/// + ErrorInitializationFailed will be returned if PAL is unable to query the available Devices.
virtual Result EnumerateDevices(
uint32* pDeviceCount,
IDevice* pDevices[MaxDevices]) = 0;
/// Returns the storage size of the object implementing IScreen.
///
/// Use this to determine the size of each pStorage pointer passed to GetScreens.
///
/// @returns the storage size in bytes of the object implementing IScreen.
virtual size_t GetScreenObjectSize() const = 0;
/// Retrieves the list of available screens.
///
/// This function queries a set of @ref IScreen objects corresponding to the screens attached to the system.
/// CreatePlatform() and IPlatform::EnumerateDevices() must be called before this function is called.
///
/// This function may be called multiple times during the lifetime of the PAL lib. Each call returns a new
/// set of screen objects.
///
/// @ingroup LibInit
///
/// @param [out] pScreenCount Specifies the number of screens available in the system. This is the number of valid
/// entries in pScreens[] and pStorage[]. Must not be null.
/// @param [in] pStorage Array of caller-allocated storage for the screen objects. Each must be the size
/// returned by GetScreenObjectSize. Must always pre-allocate MaxScreens worth, must
/// not be NULL nor may any entry be NULL.
/// @param [out] pScreens Array to be populated with a screen pointer for each screen available in the system.
/// The first *pScreenCount entries are valid. Must not be null.
///
/// @note pScreens[i] uses the storage from pStorage[i]. pStorage[i] is unused for i >= *pScreenCount.
///
/// @returns Success if all screens were successfully retrieved in pScreens[]. Otherwise, one of the following
/// error codes may be returned:
/// + ErrorUnavailable if this was called prior to IPlatform::EnumerateDevices().
virtual Result GetScreens(
uint32* pScreenCount,
void* pStorage[MaxScreens],
IScreen* pScreens[MaxScreens]) = 0;
/// Queries a client specified application profile in raw format.
///
/// This function queries the kernel-mode driver to determine if there is a platform-wide profile for a specific
/// application that the client would like to honor. It is optional, and doesn't need to be called if the client
/// does not wish to support application profiles.
///
/// As the format of profile is client specified, the profile will be returned in raw format and client has the
/// responsibility to parse the profile. @see GpuUtil::ProfileIterator provides a basic capability to iterate all
/// properties in the raw data packet. The memory storing the raw data is managed by Pal.
///
/// The pFilename string can be the EXE name, like "doom.exe", or the "Content Distribution Network" (CDN) ID,
/// like "SteamAppId:570". You can use the function GpuUtil::QueryAppContentDistributionId() to get the CDN ID.
///
/// @ingroup LibInit
///
/// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile.
/// See GpuUtil::QueryAppContentDistributionId().
/// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish.
/// @param [in] client Client name that KMD will query the profile for
/// @param [out] pOut Will be filled with the application profile string if the profile exists and was
/// successfully queried.
///
/// @returns Success if the application profile exists for the specified string(s) and the profile was successfully
/// retrieved, or Unsupported if the profile does not exist and the query was successfully performed.
/// Otherwise, one of the following error codes may be returned:
/// + ErrorInvalidPointer will be returned if pFilename or pOut is null.
/// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices
/// discovered.
virtual Result QueryRawApplicationProfile(
const wchar_t* pFilename,
const wchar_t* pPathname,
ApplicationProfileClient client,
const char** pOut) = 0;
/// Enable UMD side support for the SPP feature (Shader Profiling for Power). The gist of the initial version
/// of this feature is that we will profile important applications to determine which shaders are heavily memory
/// bound, then use these profiles to program the RLC to dynamically reduce engine clocks when running such shaders.
/// This should result in power savings with a limited perf impact.
/// Each of these app profiles will include one or more tables specifying how to program the RLC.
/// There may be multiple tables in cases where we need different RLC programming based on user controlled factors
/// that affect memory boundedness: resolution, MSAA rate, etc.
/// @ingroup LibInit
///
/// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile.
/// See GpuUtil::QueryAppContentDistributionId().
/// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish.
/// @returns Success if the application profile exists for the specified string(s) and the profile was successfully
/// retrieved, or Unsupported if the profile does not exist and the query was successfully performed.
/// Otherwise, one of the following error codes may be returned:
/// + ErrorInvalidPointer will be returned if pFilename is null.
/// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices
/// discovered.
virtual Result EnableSppProfile(
const wchar_t* pFilename,
const wchar_t* pPathname) = 0;
/// Reports the properties of the platform.
///
/// Returns the capabilities and general properties of this platform instantiation.
///
/// @param [out] pProperties Capabilities and general properties of this platform instantiation (not tied to a
/// particular device).
///
/// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the
/// following errors may be returned:
/// + ErrorInvalidPointer if pProperties is null.
virtual Result GetProperties(
PlatformProperties* pProperties) = 0;
/// Installs the callback into the specified platform.
///
/// This static helper forwards to the two-argument InstallDeveloperCb() member of pPlatform.
///
/// @param [in] pPlatform The platform to install the callback into. It is dereferenced unconditionally, so it must
/// not be null.
/// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform.
/// @param [in] pPrivateData Private data that is installed with the callback for use by the installer.
static void InstallDeveloperCb(
    IPlatform*          pPlatform,
    Developer::Callback pfnDeveloperCb,
    void*               pPrivateData)
{
    // No trailing ';' after the body: it is a redundant empty declaration that -Wextra-semi reports.
    pPlatform->InstallDeveloperCb(pfnDeveloperCb, pPrivateData);
}
/// Returns the currently enabled developer callback types.
///
/// @returns The bitmask of currently enabled developer callback types
virtual uint32 GetEnabledCallbackTypes() const = 0;
/// Sets the currently enabled developer callback types.
///
/// @param [in] enabledCallbackTypesMask A bitmask of client requested developer callback types to enable
virtual void SetEnabledCallbackTypes(
uint32 enabledCallbackTypesMask) = 0;
/// Returns a pointer to the developer driver server object if developer mode is enabled on the system.
///
/// Callers must check the result for nullptr before using it.
///
/// @returns A valid DevDriver::DevDriverServer pointer if developer mode is enabled. If developer mode is not
/// enabled, nullptr will be returned.
virtual DevDriver::DevDriverServer* GetDevDriverServer() = 0;
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890
/// Legacy settings service accessor. Will be replaced by GetSettingsRpcService().
///
/// Only declared when the client builds against a PAL interface major version below 890.
///
/// @returns A pointer to the SettingsRpcService::SettingsService object.
/// NOTE(review): whether nullptr can be returned is not documented here - confirm before dereferencing.
virtual SettingsRpcService::SettingsService* GetSettingsService() = 0;
#endif
/// Returns the settings RPC service. Client drivers can register their DevDriver based settings components via
/// SettingsRpcService.
///
/// @returns A pointer to a SettingsRpcService object. Could be nullptr if developer driver mode is not enabled, so
/// callers must check the result before use.
virtual DevDriver::SettingsRpcService* GetSettingsRpcService() = 0;
/// Returns a pointer to the event server object.
///
/// The event server will soon move out of the DevDriver server, hence the need for a separate interface to access
/// it.
///
/// @returns A valid EventServer pointer or nullptr if not valid.
virtual DevDriver::EventProtocol::EventServer* GetEventServer() = 0;
#if PAL_BUILD_RDF
/// Returns a pointer to the current trace session if one was created during startup.
///
/// This declaration is only compiled when PAL_BUILD_RDF is enabled.
///
/// @returns A valid TraceSession pointer if a session currently exists. If a trace session was not created during
/// startup, nullptr will be returned.
virtual GpuUtil::TraceSession* GetTraceSession() = 0;
/// Notifies the frame trace controller that the frame count has advanced.
/// The client driver is responsible for calling this method once per frame.
///
/// This declaration is only compiled when PAL_BUILD_RDF is enabled.
///
/// @param [in] pQueue The queue on which a new frame has been detected.
virtual void UpdateFrameTraceController(
IQueue *pQueue) = 0;
#endif
/// Gets the GPU ID for a given PAL device index.
///
/// The GPU ID is determined from (BusID << 16) | (DeviceID << 8) | FunctionID.
///
/// @param [in] gpuIndex The index of the GPU.
///
/// @returns A GPU ID as described above, or UnknownGPUID if the device doesn't exist.
virtual PciId GetPciId(uint32 gpuIndex) = 0;
/// Indicates whether tracing has been enabled.
///
/// This is a const query; it does not modify the platform.
///
/// @returns True if tracing is enabled, false otherwise.
virtual bool IsTracingEnabled() const = 0;
/// Indicates whether the driver has been signaled to enable crash analysis mode.
///
/// This is a const query; it does not modify the platform.
///
/// @returns True if crash analysis is enabled, false otherwise.
virtual bool IsCrashAnalysisModeEnabled() const = 0;
/// Indicates whether the driver has been signaled to enable Raytracing Shader Data Tokens.
///
/// This is a const query; it does not modify the platform.
///
/// @returns True if Raytracing Shader Data Tokens have been requested, false otherwise.
virtual bool IsRaytracingShaderDataTokenRequested() const = 0;
/// Returns a read-only reference to the Platform settings structure.
///
/// @returns A const reference to a PalPlatformSettings structure.
virtual const PalPlatformSettings& PlatformSettings() const = 0;
/// Gets the primary surface layout based upon the VidPnSource provided by the client.
///
/// This function is used by the client to query the layout of the primary surface. The layout describes how the
/// primary surface is composed from a set of views. Each view provides the rectangle of the surface area and the GPUs
/// this surface area will be displayed on.
///
/// The query is a two-call protocol:
/// 1. The client makes a first call with pPrimaryLayoutOutput->pViewInfoList set to NULL to query the number of views
/// this primary surface has.
/// 2. Based on pPrimaryLayoutOutput->numViews, the client allocates the buffer for pViewInfoList and then makes the
/// call again to query the actual view information.
///
/// @param [in] vidPnSourceId VidPnSource ID that's associated with a primary surface.
/// @param [in, out] pPrimaryLayoutOutput Primary surface layout output arguments.
///
/// @returns Success if the display layout on the given vidPnSourceId was successfully queried.
/// Otherwise, one of the following errors may be returned:
/// + ErrorInvalidValue if pPrimaryLayoutOutput is invalid.
/// + ErrorUnavailable if there is no implementation on the current platform.
/// + ErrorOutOfMemory if there is not enough system memory.
virtual Result GetPrimaryLayout(
uint32 vidPnSourceId,
GetPrimaryLayoutOutput* pPrimaryLayoutOutput) = 0;
/// Calls the TurboSyncControl escape to control TurboSync on a specific vidPnSourceId.
///
/// The function is called when clients intend to toggle TurboSync on a vidPnSourceId. The client should allocate
/// private swapchain primary surfaces that are compatible with the application swapchain primaries. When used to
/// activate TurboSync, the private primaries' handles need to be passed in the TurboSyncControlInput data.
///
/// @param [in] turboSyncControlInput TurboSyncControl input arguments. See TurboSyncControlInput.
///
/// @returns Success if the TurboSyncControl request is handled successfully.
virtual Result TurboSyncControl(
const TurboSyncControlInput& turboSyncControlInput) = 0;
/// Retrieves the opaque client data pointer currently associated with this object.
/// Clients can use it to attach arbitrary data to a particular PAL object.
///
/// @returns The stored client data pointer (see SetClientData()).
void* GetClientData() const { return m_pClientData; }
/// Stores an opaque client data pointer on this object, replacing any previously stored value.
/// Clients can use it to attach arbitrary data to a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data. The pointer is stored as-is.
void SetClientData(void* pClientData) { m_pClientData = pClientData; }
/// Allocates memory through the platform.
///
/// When PAL_MEMTRACK is enabled the request goes through the memory leak tracker; otherwise it goes straight to the
/// platform's ForwardAllocator.
///
/// @param [in] allocInfo @see Util::AllocInfo
///
/// @returns Pointer to the allocated memory on success, nullptr on failure.
void* Alloc(const Util::AllocInfo& allocInfo)
{
    void* pMemory = nullptr;

#if PAL_MEMTRACK
    pMemory = m_memTracker.Alloc(allocInfo);
#else
    pMemory = m_allocator.Alloc(allocInfo);
#endif

    return pMemory;
}
/// Releases memory through the platform.
///
/// When PAL_MEMTRACK is enabled the request goes through the memory leak tracker; otherwise it goes straight to the
/// platform's ForwardAllocator. This mirrors the routing used by Alloc().
///
/// @param [in] freeInfo @see Util::FreeInfo
void Free(const Util::FreeInfo& freeInfo)
{
#if PAL_MEMTRACK
    auto& backend = m_memTracker;
#else
    auto& backend = m_allocator;
#endif

    backend.Free(freeInfo);
}
/// Logs a text string via the developer driver bus if it is currently connected.
///
/// This is the va_list form that platforms implement; the variadic LogMessage() overload forwards to it.
///
/// @param [in] level Log priority level associated with the message.
/// @param [in] categoryMask Log category mask that represents what category fields the message relates to.
/// @param [in] pFormat Format string for the log message.
/// @param [in] args Variable arguments that correspond to the format string.
virtual void LogMessage(LogLevel level,
LogCategoryMask categoryMask,
const char* pFormat,
va_list args) = 0;
/// Logs a text string via the developer driver bus if it is currently connected.
///
/// Variadic convenience overload: it packs the trailing arguments into a va_list and forwards to the virtual
/// va_list overload of LogMessage().
///
/// @param [in] level        Log priority level associated with the message.
/// @param [in] categoryMask Log category mask that represents what category fields the message relates to.
/// @param [in] pFormat      Format string for the log message.
/// @param [in] ...          Variable arguments that correspond to the format string.
void LogMessage(
    LogLevel        level,
    LogCategoryMask categoryMask,
    const char*     pFormat,
    ...)
{
    va_list argList;

    va_start(argList, pFormat);
    LogMessage(level, categoryMask, pFormat, argList);
    va_end(argList);
}
/// Logs an event using the DevDriver protocol.
///
/// The implementation provided here is an empty no-op: all three arguments are ignored. The method is virtual, so
/// a derived platform can override it to actually emit the event.
///
/// @param [in] eventId The type of event you want to log.
/// @param [in] pEventData A pointer to the struct corresponding to the event id.
/// @param [in] eventDataSize The size of the event data struct.
virtual void LogEvent(
PalEvent eventId,
const void* pEventData,
uint32 eventDataSize) {}
#if PAL_ENABLE_LOGGING
/// Function to access the current settings of the file logger.
/// Clients can call this function to get the file logger settings in order to configure
/// this logger at the time of its creation.
///
/// This declaration is only compiled when PAL_ENABLE_LOGGING is enabled.
///
/// @param [out] pSettings A struct into which the file logger settings are copied.
virtual void GetDbgLoggerFileSettings(
Util::DbgLoggerFileSettings* pSettings) = 0;
#endif
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method.
///
/// Forwards the allocation callbacks to the forward allocator and clears the client data pointer.
///
/// @param [in] allocCb Allocation callbacks handed to the platform's ForwardAllocator.
IPlatform(
const Util::AllocCallbacks& allocCb)
:
#if PAL_MEMTRACK
// The tracker only receives the address of m_allocator here. Members are constructed in declaration order, so
// m_allocator has not been constructed yet at this point.
// NOTE(review): this relies on the tracker's constructor merely storing the pointer - confirm.
m_memTracker(&m_allocator),
#endif
m_allocator(allocCb),
m_pClientData(nullptr) { }
/// @internal Destructor. It is protected so that clients cannot use the delete operator on this interface. Clients
/// must destroy objects by explicitly calling IDestroyable::Destroy() and are responsible for freeing the system
/// memory allocated for the object on their own.
virtual ~IPlatform() = default;
/// @internal Initialization common to all platforms; subclass overrides of this function must call it.
/// Currently the only work done here is initializing the memory leak tracker (PAL_MEMTRACK builds only).
///
/// @returns The leak tracker's initialization result when PAL_MEMTRACK is enabled, Success otherwise.
virtual Result Init()
{
    Result result = Result::Success;

#if PAL_MEMTRACK
    result = m_memTracker.Init();
#endif

    return result;
}
/// Used by the static InstallDeveloperCb() to install the event handler according to the derived platform.
///
/// This overload is protected; clients reach it through the public static InstallDeveloperCb() wrapper.
///
/// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform.
/// @param [in] pPrivateData Private data that is installed with the event handler for use by the installer.
virtual void InstallDeveloperCb(
Developer::Callback pfnDeveloperCb,
void* pPrivateData) = 0;
#if PAL_MEMTRACK
/// @internal Memory leak tracker. Requires an allocator in order to perform the actual allocations. We can't
/// give it this platform as its allocator: the platform's Alloc() and Free() forward to the tracker, so
/// (presumably) the two would call each other without end and overflow the stack. We must give it our forward
/// allocator instead.
/// Declared before m_allocator, so it is constructed first and destroyed last.
Util::MemTracker<Util::ForwardAllocator> m_memTracker;
#endif
/// @internal Memory allocator. Calls to Alloc() and Free() are chained down to the allocator's counterparts
/// (via the leak tracker when PAL_MEMTRACK is enabled).
Util::ForwardAllocator m_allocator;
private:
/// @internal Client data pointer. This can have an arbitrary value; it is returned by GetClientData() and set via
/// SetClientData(). The constructor initializes it to nullptr.
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
};
} // Pal
@@ -0,0 +1,234 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palQueryPool.h
* @brief Defines the Platform Abstraction Library (PAL) IQueryPool interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palGpuMemoryBindable.h"
namespace Pal
{
/// Specifies a category of GPU query pool.
enum class QueryPoolType : uint32
{
Occlusion = 0x0, ///< Occlusion query pool. Supports queries based on the Z test.
PipelineStats = 0x1, ///< Pipeline stats query pool. Supports queries based on statistics from the GPU's execution
/// such as a count of prims generated, shader invocations, etc.
StreamoutStats = 0x2, ///< Streamout query pool. Supports queries based on statistics from the GPU's execution
/// such as number of primitives written to SO buffer and storage needed.
Count, ///< Number of query pool types listed above.
};
/// Specifies what data a query slot must produce. Some query pool types support multiple query types.
enum class QueryType : uint32
{
Occlusion = 0x0, ///< The total passes recorded by the Z test.
BinaryOcclusion = 0x1, ///< One if there were one or more Z test passes, zero otherwise.
PipelineStats = 0x2, ///< The total statistics selected by the given pipeline stats query pool.
StreamoutStats = 0x3, ///< SO statistics tracked by CP/VGT including primitives written and storage needed.
StreamoutStats1 = 0x4, ///< SO1 statistics tracked by CP/VGT including primitives written and storage needed.
StreamoutStats2 = 0x5, ///< SO2 statistics tracked by CP/VGT including primitives written and storage needed.
StreamoutStats3 = 0x6, ///< SO3 statistics tracked by CP/VGT including primitives written and storage needed.
Count, ///< Number of query types listed above.
};
/// Specifies which pipeline stats should be tracked by a pipeline stats query pool.
///
/// Each flag occupies a single bit so that flags can be ORed together into a mask.
enum QueryPipelineStatsFlags : uint32
{
QueryPipelineStatsIaVertices = 0x1, ///< Input vertices.
QueryPipelineStatsIaPrimitives = 0x2, ///< Input primitives.
QueryPipelineStatsVsInvocations = 0x4, ///< Vertex shader invocations.
QueryPipelineStatsGsInvocations = 0x8, ///< Geometry shader invocations.
QueryPipelineStatsGsPrimitives = 0x10, ///< Geometry shader primitives.
QueryPipelineStatsCInvocations = 0x20, ///< Clipper invocations.
QueryPipelineStatsCPrimitives = 0x40, ///< Clipper primitives.
QueryPipelineStatsPsInvocations = 0x80, ///< Pixel shader invocations.
QueryPipelineStatsHsInvocations = 0x100, ///< Hull shader invocations.
QueryPipelineStatsDsInvocations = 0x200, ///< Domain shader invocations.
QueryPipelineStatsCsInvocations = 0x400, ///< Compute shader invocations.
QueryPipelineStatsTsInvocations = 0x800, ///< Task shader invocations.
QueryPipelineStatsMsInvocations = 0x1000, ///< Mesh shader invocations.
QueryPipelineStatsMsPrimitives = 0x2000, ///< Mesh shader primitives.
QueryPipelineStatsAll = 0x3FFF ///< All of the above stats (the OR of the fourteen flags above).
};
/// Specifies properties for @ref IQueryPool creation. Input structure to IDevice::CreateQueryPool().
struct QueryPoolCreateInfo
{
QueryPoolType queryPoolType; ///< Type of query pool to create (i.e., occlusion vs. pipeline stats).
uint32 numSlots; ///< Number of slots in the query pool.
uint32 enabledStats; ///< An ORed mask of stats flags specific to the query pool type.
/// @see QueryPipelineStatsFlags for PipelineStats query pools.
union
{
struct
{
/// If set, this query pool can have results retrieved using the CPU (using @ref IQueryPool::GetResults)
/// and can be reset using the CPU (using @ref IQueryPool::Reset). Otherwise, the client must use command
/// buffers to perform these operations (using @ref ICmdBuffer::CmdResetQueryPool and
/// @ref ICmdBuffer::CmdResolveQuery).
uint32 enableCpuAccess : 1;
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed together as a uint32.
} flags; ///< Flags controlling QueryPool behavior.
};
/// Controls operations that compute query results.
///
/// Each flag other than QueryResultDefault occupies a single bit so that flags can be ORed together.
enum QueryResultFlags : uint32
{
QueryResultDefault = 0x0, ///< Default to 32-bit results with no waiting.
QueryResult64Bit = 0x1, ///< Store all results as 64-bit values.
QueryResultWait = 0x2, ///< Wait for the queries to finish when computing the results.
QueryResultAvailability = 0x4, ///< If the results of a query are available at computation time a one will be
/// written as a separate value after the result value, if the results were not
/// available a zero will be written.
QueryResultPartial = 0x8, ///< If the final result of a query would be unavailable, then return a
/// result for that query between 0 and what the final result would be.
QueryResultAccumulate = 0x10, ///< Results are added to the values present in the destination, if availability
/// data is enabled it will be ANDed with the present availability data.
QueryResultPreferShaderPath = 0x20, ///< Prefer a shader resolve path over a command processor path.
QueryResultOnlyPrimNeeded = 0x40, ///< Select only primitives storage needed in Streamout query results.
QueryResultAll = 0x7F ///< The OR of all flags above. Clients should NOT use it; it exists for
/// internal static_assert purposes only.
};
/**
***********************************************************************************************************************
* @interface IQueryPool
* @brief Represents a set of queries that can be used to retrieve detailed info about the GPU's execution of a
* particular range of a command buffer.
*
* The available pool categories are enumerated by @ref QueryPoolType (occlusion, pipeline statistics and streamout
* statistics). All queries in a pool are the same type.
*
* @see IDevice::CreateQueryPool()
***********************************************************************************************************************
*/
class IQueryPool : public IGpuMemoryBindable
{
public:
/// Retrieves query results from a query pool.
///
/// Multiple consecutive query results can be retrieved with one call.
///
/// @param [in] flags Flags that control the result data layout and how the results are retrieved.
/// @param [in] queryType Specifies what data the query slots must produce.
/// @param [in] startQuery First query pool slot to retrieve data for.
/// @param [in] queryCount Number of query pool slots to retrieve data for.
/// @param [in] pMappedGpuAddr Specify the query buffer mapped address. If the parameter equals nullptr,
/// this method will use Map/UnMap to access the data.
/// @param [in,out] pDataSize Input value specifies the available size in pData in bytes; output value reports the
/// number of bytes required to hold all result data.
/// @param [out] pData Location where the query results should be written. Can be null in order to query the
/// required size. The data returned depends on the query pool type and flags. All data
/// entries are either uint32 or uint64 integers. One or more type-specific entries will
/// be optionally followed by one entry for availability. The type-specific data is:<br>
/// + QueryOcclusion: One entry to store the zPass count.
/// + QueryPipelineStats: One entry per statistic enabled in the create info. The stats
/// will be written in the appropriate order for each PAL client.
/// @param [in] stride Stride in bytes between subsequent query result data or zero to request tightly
/// packed result data.
///
/// @returns Success if query results were successfully returned in pData, or NotReady if any of the requested query
/// slots does not yet have results available. Otherwise, one of the following error codes may be
/// returned:
/// + ErrorInvalidValue if the range defined by startQuery and queryCount is not valid for this query pool.
/// + ErrorGpuMemoryNotBound if the query pool requires GPU memory but none is bound.
/// + ErrorInvalidMemorySize if pData is non-null and the value stored in pDataSize is too small.
virtual Result GetResults(
QueryResultFlags flags,
QueryType queryType,
uint32 startQuery,
uint32 queryCount,
const void* pMappedGpuAddr,
size_t* pDataSize,
void* pData,
size_t stride) = 0;
/// Use CPU to reset the query pool slots.
///
/// Supported for occlusion and video decode statistics query pools.
/// NOTE(review): QueryPoolType as declared in this header has no video decode statistics enumerator - confirm
/// whether that part of the statement is still accurate.
///
/// @param [in] startQuery First query pool slot to reset.
/// @param [in] queryCount Number of query pool slots to reset.
/// @param [in] pMappedCpuAddr Specify the query buffer mapped address. If the parameter equals nullptr,
/// this method will use Map/UnMap to access the data.
///
/// @returns Success if the reset was successfully performed.
virtual Result Reset(
uint32 startQuery,
uint32 queryCount,
void* pMappedCpuAddr) = 0;
/// Returns the distance, in bytes, between successive query slots in the bound GPU memory.
/// This method is documented as only supported for video decode statistics query pools
/// (QueryPoolType::VideoDecodeStats).
/// NOTE(review): QueryPoolType as declared in this header does not define VideoDecodeStats - confirm which pool
/// types actually support this method.
///
/// @returns the distance, in bytes, between successive query slots in the bound GPU memory.
virtual gpusize GetQuerySlotStride() const = 0;
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data.
void* GetClientData() const
{
return m_pClientData;
}
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method. The client data pointer starts out as nullptr.
IQueryPool() : m_pClientData(nullptr) {}
/// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
/// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
/// object on their own.
virtual ~IQueryPool() { }
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
};
} // Pal
@@ -0,0 +1,765 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palQueue.h
* @brief Defines the Platform Abstraction Library (PAL) IQueue interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
#include "palEvent.h"
namespace Pal
{
// Forward declarations of PAL types that this header refers to without needing their full definitions.
class ICmdBuffer;
class IFence;
class IGpuMemory;
class IImage;
class IPrivateScreen;
class IQueueSemaphore;
class IScreen;
class ISwapChain;
struct CmdBufInfo;
struct GpuMemSubAllocInfo;
struct GpuMemoryRef;
struct DoppRef;
// Opaque enum declaration: the fixed underlying type allows the enum to be named before its definition is seen.
enum class VirtualGpuMemAccessMode : uint32;
/// Specifies whether presents are windowed or fullscreen. This will determine whether the present is performed via a
/// BLT or flip.
enum class PresentMode : uint32
{
Unknown, ///< Present mode is not known. This is the zero value of the enum.
Windowed, ///< Windowed present. NOTE(review): presumably the BLT path - confirm.
Fullscreen, ///< Fullscreen present. NOTE(review): presumably the flip path - confirm.
Count ///< Number of present modes listed above.
};
/// Enumerates the possible overrides for the flip interval.
///
/// NOTE(review): the leading underscore in _None presumably avoids a clash with a `None` macro from another header
/// (verify). Be aware that identifiers starting with an underscore followed by an uppercase letter are reserved in
/// C++.
enum class FlipIntervalOverride : uint32
{
_None = 0, ///< No override.
Immediate = 1, ///< Zero frames of flip latency.
ImmediateAllowTearing = 2, ///< Same as Immediate, but allows tearing (no vsync).
One = 3, ///< One frame of flip latency.
Two = 4, ///< Two frames of flip latency.
Three = 5, ///< Three frames of flip latency.
Four = 6, ///< Four frames of flip latency.
};
/// Defines flags for describing which types of present modes are supported on a given queue.
///
/// Each flag occupies a single bit so that flags can be ORed together into a mask.
enum PresentModeSupport : uint32
{
SupportWindowedPresent = 0x1, ///< Windowed presents are supported.
SupportWindowedPriorBlitPresent = 0x2, ///< Windowed "prior blit" presents are supported.
///  NOTE(review): presumably related to QueueCreateInfo's windowedPriorBlit
///  flag - confirm.
SupportFullscreenPresent = 0x4, ///< Fullscreen presents are supported.
};
/// Defines submit-time bottlenecks which PAL can potentially optimize.
enum class SubmitOptMode : uint32
{
Default = 0, ///< PAL will enable optimizations when generally efficient.
Disabled = 1, ///< Disable all optimizations that could be detrimental in special cases.
MinKernelSubmits = 2, ///< Minimize the overhead of launching command buffers on the CPU and GPU.
MinGpuCmdOverhead = 3, ///< Minimize the overhead of reading command buffer commands on the GPU.
Count ///< Number of submit optimization modes listed above.
};
/// Enumerates the affinity statuses of a VCN instance.
enum MmAffinityStatus : uint32
{
MmAffinityNotAllowed = 0, ///< The specific vcn instance can't be used.
MmAffinityAllowed = 1 ///< The specific vcn instance can be used.
};
/// Union describing the affinity status of every vcn instance.
///
/// NOTE(review): each 2-bit field presumably holds an MmAffinityStatus value - confirm.
union MmAffinity
{
struct
{
uint32 vcn0Affinity : 2; ///< Affinity for instance vcn0
uint32 vcn1Affinity : 2; ///< Affinity for instance vcn1
uint32 reserved : 28; ///< Reserved (all 0)
};
uint32 u32All; ///< All affinity fields packed together as a uint32.
};
/// Structure describing dump information for a command buffer.
struct CmdBufferDumpDesc
{
EngineType engineType; ///< The engine type that this buffer is targeted for.
QueueType queueType; ///< The type of queue that this buffer is being created on.
SubEngineType subEngineType; ///< The ID of which sub-engine that this buffer is made for.
uint32 cmdBufferIdx; ///< The index into the SubmitInfo ppCmdBuffers array that this
/// command buffer dump came from.
union
{
struct
{
uint8 isPreamble : 1; ///< Set if the buffer is an internal preamble command buffer.
uint8 isPostamble : 1; ///< Set if the buffer is an internal postamble command buffer.
uint8 reserved : 6; ///< Reserved for future use.
};
uint8 u32All; ///< Flags packed as 8-bit uint. Despite the "u32" in its name, this member is a uint8.
} flags; ///< Flags describing the kind of command buffer being dumped.
};
/// Structure describing a command buffer chunk for use while dumping command buffers.
///
/// An array of these is handed to the @ref CmdDumpCallback.
struct CmdBufferChunkDumpDesc
{
uint32 id; ///< ID (number) of this command chunk within the command buffer.
const void* pCommands; ///< Pointer to the command data.
size_t size; ///< Size of valid data in bytes pointed to in pCommands.
};
/// Definition for command buffer dumping callback.
///
/// @param [in] cmdBufferDesc Description of the command buffer.
/// @param [in] pChunks Pointer to an array of command buffer chunk descriptions.
/// @param [in] numChunks The number of chunks pointed to in pChunks.
/// @param [in] pUserData Client provided data that is passed through to the callback.
typedef void (PAL_STDCALL* CmdDumpCallback)(
const CmdBufferDumpDesc& cmdBufferDesc,
const CmdBufferChunkDumpDesc* pChunks,
uint32 numChunks,
void* pUserData);
/// Specifies properties for @ref IQueue creation. Input structure to IDevice::CreateQueue().
struct QueueCreateInfo
{
QueueType queueType; ///< Selects which type of queue to create.
EngineType engineType; ///< Selects which type of engine to create.
uint32 engineIndex; ///< Which instance of the specified engine type to query. For example, there
/// can be multiple compute queues, so this parameter distinguishes between them.
SubmitOptMode submitOptMode; ///< A hint telling PAL which submit-time bottlenecks should be optimized, if any.
QueuePriority priority; ///< A hint telling PAL to create queue with proper priority.
/// It is only supported if supportQueuePriority is set in DeviceProperties.
/// In Linux, if we don't have root privilege, the creation with above-Medium
/// priority will fail. Client should take the corresponding action like retry
/// with lower priority, if necessary.
/// Queue creation flags. The struct is unnamed and has no member name, so the bitfields below are accessed
/// directly as members of QueueCreateInfo. Together they occupy 32 bits.
struct
{
uint32 aqlQueue : 1; ///< Compute queue will process AQL packets and kernels
uint32 windowedPriorBlit : 1; ///< All windowed presents on this queue are notifications
/// that the client has manually done a blit present
uint32 tmzOnly : 1; ///< This queue allows only TMZ submissions. Required for
/// compute TMZ submits.
#if PAL_AMDGPU_BUILD
uint32 enableGpuMemoryPriorities : 1; ///< Enables support for GPU memory priorities on this Queue.
/// This is optional because enabling the feature requires
/// a small amount of memory overhead per-Queue for
/// bookkeeping purposes.
#else
uint32 placeholder2 : 1; ///< Reserved field. Set to 0.
#endif
uint32 dispatchTunneling : 1; ///< This queue uses compute dispatch tunneling.
uint32 forceWaitIdleOnRingResize : 1; ///< This queue needs to wait for idle before resizing the RingSet.
/// This is intended as a workaround for misbehaving applications.
#if defined(_WIN32)
uint32 nullRendering : 1; ///< Setting this bit makes this queue behave like IfhModeKmd.
#else
uint32 placeholder3 : 1; ///< Reserved field. Set to 0.
#endif
uint32 reserved : 25; ///< Reserved for future use.
};
uint32 numReservedCu; ///< The number of reserved compute units for RT CU queue
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914
uint32 persistentCeRamOffset; ///< Byte offset to the beginning of the region of CE RAM which this Queue should
/// preserve across consecutive submissions. Must be a multiple of 32. It is an
/// error to specify a nonzero value here if the Device does not support
/// @ref supportPersistentCeRam for the Engine this Queue will attach to.
uint32 persistentCeRamSize; ///< Amount of CE RAM space which this Queue should preserve across consecutive
/// submissions. Units are in DWORDs, and this must be a multiple of 8. It is an
/// error to specify a nonzero value here if the Device does not support
/// @ref supportPersistentCeRam for the Engine this Queue will attach to.
#endif
uintptr_t aqlPacketList; ///< Location of the HIP runtime's info about this queue
};
/// Specifies the portion of @ref MultiSubmitInfo that is specific to each sub-queue in a multi-queue object (@see
/// IDevice::CreateMultiQueue). Effectively, this enables specifying a different set of command buffers for each
/// queue that makes up a gang submission to a multi-queue object.
struct PerSubQueueSubmitInfo
{
uint32 cmdBufferCount; ///< Number of command buffers to be submitted (can be 0 if this submit doesn't
/// involve work for the relevant queue).
ICmdBuffer*const* ppCmdBuffers; ///< Array of cmdBufferCount command buffers to be submitted. Command buffers
/// that are part of a ganged submit must guarantee the conditions required
/// for the optimizeExclusiveSubmit flag.
const CmdBufInfo* pCmdBufInfoList; ///< Null, or an array of cmdBufferCount structs providing additional
/// info about the command buffers being submitted. If non-null,
/// elements are ignored if their isValid flag is false.
};
/// Specifies all information needed to execute a set of command buffers. Input structure to IQueue::Submit().
///
/// Some members of this structure are not supported on all platforms. The client must check the appropriate properties
/// structures to determine if the corresponding features are supported:
/// + pGpuMemoryRefs: Support is indicated by supportPerSubmitMemRefs in @ref DeviceProperties.
/// + ppBlockIfFlipping: Support is indicated by supportBlockIfFlipping in @ref PlatformProperties. If it is supported,
/// the client must not specify a blockIfFlippingCount greater than MaxBlockIfFlippingCount.
///
/// @note If this queue is running in physical submission mode (due to hardware restrictions), the gpuMemRefCount and
/// pGpuMemoryRefs members of this structure are ignored because the command buffers themselves contain their own
/// GPU memory reference lists.
struct MultiSubmitInfo
{
const PerSubQueueSubmitInfo* pPerSubQueueInfo;///< Specifies per-subqueue information for the submit. Typically
/// this is a pointer to a single entry specifying the command
/// buffers to be submitted on this queue. For gang submission on
/// a multi-queue, this should be an array with one entry per
/// sub-queue. The array size must be less than or equal to the
/// queueCount specified when the multi-queue was created and
/// the workload specified in each entry will be assigned to the
/// corresponding sub-queue. It is valid to have a cmdBufferCount
/// of 0 for sub-queues without work. Can be null if
/// perSubQueueInfoCount is 0.
uint32 perSubQueueInfoCount; ///< Number of PerSubQueueSubmitInfo entries in pPerSubQueueInfo. Can be
/// zero if there is no work to submit.
uint32 gpuMemRefCount; ///< Number of GPU memory references for this submit.
const GpuMemoryRef* pGpuMemoryRefs; ///< Array of gpuMemRefCount GPU memory references. Can be null if
/// gpuMemRefCount is zero. The GPU memory objects will be made
/// resident for the duration of this submit.
uint32 doppRefCount; ///< Number of DOPP desktop texture references for this submit.
const DoppRef* pDoppRefs; ///< Array of doppRefCount DOPP texture references. Can be null if
/// doppRefCount is zero.
uint32 externPhysMemCount; ///< Number of entries in ppExternPhysMem.
const IGpuMemory** ppExternPhysMem; ///< Array of externPhysMemCount external physical memory allocations
/// to be initialized as part of this submit. The first submit that
/// references a particular external physical memory allocation must
/// include that allocation in this list. Subsequent submits that
/// reference the same allocation should not include it in this
/// list, as it would trigger redundant GPU page table
/// initialization.
uint32 blockIfFlippingCount; ///< Number of GPU memory objects to protect when flipped. Must not
/// exceed MaxBlockIfFlippingCount.
const IGpuMemory*const* ppBlockIfFlipping; ///< Array of blockIfFlippingCount GPU memory objects. Can be null if
/// blockIfFlippingCount is zero. The command buffers will not be
/// scheduled to the GPU while a fullscreen (flip) present is queued
/// for any of these GPU memory allocations.
uint32 fenceCount; ///< Number of fence objects to be signaled once the last command buffer
/// in this submission completes execution.
IFence** ppFences; ///< Array of fenceCount fence objects. Can be null if fenceCount is
/// zero.
CmdDumpCallback pfnCmdDumpCb; ///< Null, or a callback function to handle the dumping of the
/// command buffers used in this submit.
void* pUserData; ///< Client provided data to be passed to the callback.
uint32 stackSizeInDwords; ///< 0, or the maximum stack frame size for indirect shaders of the
/// pipelines referenced in the command buffers of this submission.
/// The size is per native thread, so the client must multiply it
/// by 2 if a Wave64 shader that needs scratch is used.
/// Note that the size will not shrink for the lifetime of the queue
/// once it is grown, and that it only affects the compute scratch
/// ring.
const IGpuMemory* pFreeMuxMemory; ///< The GPU memory object of the private flip primary surface for the
/// FreeMux feature.
};
/// Alias of @ref MultiSubmitInfo; the two names refer to the same type.
using SubmitInfo = MultiSubmitInfo;
/// Upper limit for blockIfFlippingCount in @ref SubmitInfo; clients must not specify a larger count.
constexpr uint32 MaxBlockIfFlippingCount = 16;
/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentDirect().
///
/// The source and the destination are each described by an anonymous union of an image pointer and a typed buffer
/// pointer; flags.srcIsTypedBuffer and flags.dstIsTypedBuffer state which of the two members is meaningful.
struct PresentDirectInfo
{
union
{
struct
{
uint32 fullscreenDoNotWait : 1; ///< Fail the present immediately if the present queue is full.
uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image.
uint32 dstIsTypedBuffer : 1; ///< True if the destination is a typed buffer instead of an image.
uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not
/// execute a present if this is true but may update internal
/// tracking state.
uint32 reserved : 28; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Present flags.
OsWindowHandle hWindow; ///< Native OS window handle that this image should be presented to.
PresentMode presentMode; ///< Chooses between windowed and fullscreen present.
uint32 presentInterval; ///< Must be an integer from 0 to 4. 0 indicates that the present should
/// occur immediately (may tear), and 1-4 indicates the present should
/// occur after 1 to 4 vertical syncs. Only valid for fullscreen presents.
union
{
IImage* pSrcImage; ///< Optional: The image to be presented. If null, the present will not
/// occur but PAL may still call into the OS on certain platforms that
/// expect it.
IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to be presented. If null, the present will not occur
/// but PAL may still call into the OS on certain platforms that expect it.
};
union
{
IImage* pDstImage; ///< Optional: copy from the source image to this image. If null, PAL will
/// automatically copy into the appropriate platform-specific destination.
/// This is only supported for windowed mode presents.
IGpuMemory* pDstTypedBuffer; ///< Destination typed buffer (see flags.dstIsTypedBuffer).
/// NOTE(review): this comment previously repeated the pSrcTypedBuffer
/// text ("the typed buffer to be presented"); the null behavior
/// presumably mirrors pDstImage - confirm against the implementation.
};
};
/// Media stream counter (MSC) information.
///
/// The three members together select the MSC value at which the buffer swap occurs:
/// + If the current MSC is less than targetMsc, the buffer swap will occur when the MSC value becomes equal to
/// targetMsc.
/// + If the current MSC is greater than or equal to targetMsc, the buffer swap will occur the next time the MSC
/// value is incremented to a value such that MSC % divisor = remainder.
struct MscInfo
{
uint64 targetMsc; ///< Target MSC value; governs the swap while the current MSC is still below it.
uint64 divisor; ///< Divisor of the modulo test used once the current MSC has reached targetMsc.
uint64 remainder; ///< Remainder of the modulo test used once the current MSC has reached targetMsc.
};
/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentSwapChain().
struct PresentSwapChainInfo
{
PresentMode presentMode; ///< Chooses between windowed and fullscreen present.
IImage* pSrcImage; ///< The image to be presented.
ISwapChain* pSwapChain; ///< The swap chain associated with the source image.
uint32 imageIndex; ///< The index of the source image within the swap chain. Ownership of this image
/// index will be released back to the swap chain if this call succeeds.
/// NOTE(review): IQueue::PresentSwapChain() documents that ownership is released
/// even when the present itself fails, provided this structure is valid -
/// reconcile the two statements.
uint32 rectangleCount; ///< Number of valid rectangles in the pRectangles array.
uint32 syncInterval; ///< Applicable only when flags.syncIntervalOverride is set.
/// 0 - The presentation occurs immediately, there is no synchronization.
/// 1 through 4 - Synchronize presentation after the nth vertical blank.
const Rect* pRectangles; ///< Array of rectangles defining the regions which will be updated.
uint64 presentId; ///< PresentId functions as an identifier for present operations on a swapchain.
/// If this PresentId is non-zero, then the application can later use this value
/// to refer to that image presentation. A value of zero indicates that this
/// presentation has no associated presentId. A non-zero presentId must be greater
/// than any non-zero presentId passed previously by the application for the same
/// swapchain.
union
{
struct
{
// Both preprocessor branches declare the same one-bit field; only the documentation text differs.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 941
uint32 notifyOnly : 1; ///< True if it is a notify-only present
#else
uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not
/// execute a present if this is true but may update internal
/// tracking state.
#endif
uint32 isTemporaryMono : 1; ///< True if WS Stereo is enabled, but 3D display mode turned off.
uint32 turboSyncEnabled : 1; ///< Whether TurboSync is enabled.
uint32 syncIntervalOverride : 1; ///< Override the default sync interval with the value in syncInterval.
/// Supported only on Windows wsiPlatforms.
uint32 reserved : 28; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< PresentSwapChainInfo flags.
#if PAL_AMDGPU_BUILD
MscInfo mscInfo; ///< Media stream counter information. Only present in PAL_AMDGPU_BUILD builds.
#endif
};
/// Specifies a mapping from a range of pages in a virtual GPU memory object to a range of pages in a real GPU memory
/// object. Input to IQueue::RemapVirtualMemoryPages().
///
/// When mapping pages of a virtual GPU memory object to a range of pages in a real GPU memory object on a remote GPU,
/// the client must point pRealGpuMem at a peer GPU memory object created on the input queue's device instead of the
/// actual real GPU memory object created on the remote device. This is required for two reasons:
/// 1. PAL can only view remote GPU memory using peer objects.
/// 2. PAL enforces a separation of state between different IDevice object families.
///
/// Alignment requirements:
/// + virtualStartOffset and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties.
/// + realStartOffset must be aligned to the realMemAllocGranularity member of @ref DeviceProperties.
struct VirtualMemoryRemapRange
{
IGpuMemory* pVirtualGpuMem; ///< Virtual GPU memory object whose mapping is being updated.
gpusize virtualStartOffset; ///< Start of the page range to be updated, in bytes.
IGpuMemory* pRealGpuMem; ///< Real GPU memory object the virtual range should point at.
gpusize realStartOffset; ///< Start of the page range in the real GPU memory object, in bytes.
gpusize size; ///< Size of the mapping range, in bytes.
VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for the virtual GPU memory's unmapped pages.
/// This parameter is ignored on some platforms.
};
/// Specifies a set of page mappings to copy between virtual GPU memory objects. Input to
/// IQueue::CopyVirtualMemoryPageMappings().
///
/// The source and destination can be the same memory object, and the source and destination regions may overlap.
///
/// srcStartOffset, dstStartOffset, and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties.
struct VirtualMemoryCopyPageMappingsRange
{
IGpuMemory* pSrcGpuMem; ///< Virtual GPU memory object whose mapping is being copied from.
gpusize srcStartOffset; ///< Start of the copy source range, in bytes.
IGpuMemory* pDstGpuMem; ///< Virtual GPU memory object whose mapping is being copied to.
gpusize dstStartOffset; ///< Start of the copy destination range, in bytes.
gpusize size; ///< Size of the mapping range, in bytes.
};
/// Specifies kernel level information about a context. Output structure of IQueue::QueryKernelContextInfo().
struct KernelContextInfo
{
union
{
struct
{
uint32 hasDebugVmid : 1; ///< True if the context has acquired the debug vmid.
uint32 hasHighPriorityVmid : 1; ///< True if the context has acquired the high priority vmid.
uint32 reserved : 30; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Context flags.
uint64 contextIdentifier; ///< Kernel scheduler context identifier.
};
/**
***********************************************************************************************************************
* @interface IQueue
* @brief Represents a queue of work for a particular GPU engine on a device.
*
* An IQueue object is a virtual representation of a hardware engine on the device. Multiple IQueue objects can be
* created and have work submitted on them in parallel. Work is submitted to a queue through @ref ICmdBuffer objects,
* and work can be synchronized between multiple queues using @ref IQueueSemaphore objects.
*
* @see IDevice::GetQueue()
***********************************************************************************************************************
*/
class IQueue : public IDestroyable
{
public:
/// Submits a group of root command buffers for execution on this queue.
///
/// @param [in] submitInfo Specifies all command buffers to execute along with other residency and synchronization
/// information. See @ref SubmitInfo for additional, important documentation.
///
/// @returns Success if the command buffer was successfully submitted. Otherwise, one of the following errors may
/// be returned:
/// + ErrorInvalidPointer if:
/// - any of the array inputs are null when their counts are non-zero.
/// - any members of non-null pointer arrays are null.
/// + ErrorTooManyMemoryReferences if the total number of memory references (device/queue global and
/// per-command buffer) is too large.
/// + ErrorInvalidValue if blockIfFlippingCount is too large.
/// + ErrorIncompleteCommandBuffer if any of the submitted command buffers are not properly constructed.
/// + ErrorIncompatibleQueue if any submitted command buffer does not match this queue's type (e.g.,
/// universal, graphics, DMA).
virtual Result Submit(
const MultiSubmitInfo& submitInfo) = 0;
/// Waits for all previous submissions on this queue to complete before control is returned to the caller.
///
/// @returns Success if the wait for submissions completed. Otherwise an error indicates the reason for the
/// unsuccessful wait, for example due to a lost device.
virtual Result WaitIdle() = 0;
/// Inserts a semaphore signal into the GPU queue. The semaphore will be signaled once all previously submitted
/// work on this queue has completed.
///
/// @param [in] pQueueSemaphore Semaphore to signal.
/// @param [in] value Timeline semaphore point value to signal, ignored for non-timeline semaphores.
///
/// @returns Success if the semaphore signal was successfully queued. Otherwise, one of the following errors may be
/// returned:
/// + ErrorUnknown if the OS scheduler rejects the signal for unknown reasons.
///
/// @note The default for value is declared on a virtual function. C++ selects default arguments from the static
/// type used at the call site, so implementations should not declare a different default.
virtual Result SignalQueueSemaphore(
IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0;
/// Inserts a semaphore wait into the GPU queue. The queue will be stalled until the specified semaphore is
/// signaled.
///
/// @param [in] pQueueSemaphore Semaphore to wait on.
/// @param [in] value Timeline semaphore point value to wait on, ignored for non-timeline semaphores.
///
/// @returns Success if the semaphore wait was successfully queued. Otherwise, one of the following errors may be
/// returned:
/// + ErrorUnknown if the OS scheduler rejects the wait for unknown reasons.
///
/// @note The default for value is declared on a virtual function. C++ selects default arguments from the static
/// type used at the call site, so implementations should not declare a different default.
virtual Result WaitQueueSemaphore(
IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0;
#if PAL_KMT_BUILD
/// Acquires the keyed mutex of a shared GPU memory object (CPU sync) and then waits for the synchronization object
/// of the shared GPU memory object (GPU sync based on fence). Note that the shared GPU memory object has to be
/// a D3D11 resource created with the (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE)
/// misc flags.
///
/// @param [in] pGpuMemory Shared GPU memory object on which the keyed mutex and synchronization object are bound.
/// @param [in] key Key of the keyed mutex to be acquired.
/// @param [in] timeout Timeout interval for acquiring the keyed mutex, in milliseconds.
///
/// @returns Success if the keyed mutex has been successfully acquired and the wait for the synchronization object
/// has been successfully scheduled. Otherwise, one of the following errors may be returned:
/// + ErrorUnknown if either the keyed mutex has not been successfully acquired or the wait for the
/// synchronization object has not been successfully scheduled.
virtual Result KeyedMutexAcquireSync(
IGpuMemory* pGpuMemory,
uint64 key,
std::chrono::milliseconds timeout) = 0;
/// Signals the synchronization object of a shared GPU memory object with a bumped fence value and then releases
/// the keyed mutex of the shared GPU memory object. Note that the shared GPU memory object has to be a D3D11
/// resource created with the (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) misc
/// flags.
///
/// @param [in] pGpuMemory Shared GPU memory object on which the keyed mutex and synchronization object are
/// bound.
/// @param [in] key Key of the keyed mutex to be released.
///
/// @returns Success if the signal of the synchronization object has been successfully scheduled and the keyed
/// mutex has been successfully released. Otherwise, one of the following errors may be returned:
/// + ErrorUnknown if either the signal of the synchronization object has not been successfully scheduled
/// or the keyed mutex has not been successfully released.
virtual Result KeyedMutexReleaseSync(
IGpuMemory* pGpuMemory,
uint64 key) = 0;
#endif
/// This function passes application information to KMD for application specific power optimizations.
/// Power configurations are restored to default when all application queues are destroyed.
///
/// @param [in] pFileName Application executable name
/// @param [in] pPathName Path to the application
///
/// @returns Success if the information is passed successfully. Otherwise, one of the following errors may be
/// returned:
/// + Unsupported if this function is not available on this OS or if the queue context is null.
/// + ErrorUnknown if an unexpected internal error occurs.
virtual Result UpdateAppPowerProfile(
const wchar_t* pFileName,
const wchar_t* pPathName) = 0;
/// Queues the specified image for presentation on the screen. This function directly queues the presentation
/// request based on the input parameters without special synchronization considerations like a swap chain present.
/// All previous work done on this queue will complete before the image is displayed.
///
/// This function should never be called with a swap chain presentable image because it won't release ownership of
/// the presentable image index, eventually deadlocking the swap chain.
///
/// Overall support for direct presents can be queried at platform creation time via supportNonSwapChainPresents
/// in @ref PlatformProperties. Support for particular present modes is specified via supportedDirectPresentModes
/// in @ref DeviceProperties.
///
/// @note Any images specified in presentInfo must be made resident before calling this function.
///
/// @param [in] presentInfo Specifies the source image and destination window for the present as well as other
/// properties.
///
/// @returns Success if the present was successfully queued. Otherwise, one of the following errors may be
/// returned:
/// + ErrorInvalidValue if the flip interval is invalid.
/// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image.
/// + ErrorInvalidFlags if the present flags don't match the capabilities of the image.
virtual Result PresentDirect(
const PresentDirectInfo& presentInfo) = 0;
/// Queues the specified image for presentation on the screen. This function uses the provided swap chain to
/// determine exactly how the image should be presented (e.g., can the user see tearing). See @ref ISwapChain for
/// more information on swap chain presentation. All previous work done on this queue will complete before the
/// image is displayed, but future work may execute before the present is completed because swap chain present
/// execution may be asynchronous to the queue that initiated the present.
///
/// Assuming the presentInfo is valid, this function will always release ownership of the presentable image index
/// even if PAL encounters an error while executing the present.
///
/// Queue support for swap chain presents is specified via supportsSwapChainPresents in @ref DeviceProperties.
/// Support for particular PresentModes is queried per SwapChainMode via IDevice::GetSwapChainInfo().
///
/// @note The source image specified in presentInfo must be made resident before calling this function.
///
/// @param [in] presentInfo Specifies the source image, swap chain, and basic presentation information.
///
/// @returns Success if the present was successfully queued. Otherwise, one of the following errors may be
/// returned:
/// + ErrorInvalidPointer if the source image or swap chain are null.
/// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image or if the image
/// index isn't valid within the swap chain.
virtual Result PresentSwapChain(
const PresentSwapChainInfo& presentInfo) = 0;
/// Inserts a delay of a specified amount of time before processing more commands on this queue.
///
/// Only available on timer queues. Useful in conjunction with queue semaphores to implement frame pacing.
///
/// @param [in] delay Time, in milliseconds, to delay before processing more commands on this queue.
///
/// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned:
/// + ErrorInvalidValue if delay is less than 0.
virtual Result Delay(
Util::fmilliseconds delay) = 0;
/// Inserts a delay of a specified amount of time on this queue after a vsync on a private display object.
///
/// Only available on timer queues. Useful in conjunction with queue semaphores to implement pacing of GPU and CPU
/// operations for rendering and presentation in VR, as this allows GPU commands of the next frame to be sent early
/// but blocks GPU execution until after vsync.
///
/// @param [in] delay Time, in microseconds, to delay before processing more commands on this queue.
/// @param [in] pScreen The private screen object that the vsync is occurring and the delay is waiting on.
///
/// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned:
/// + ErrorInvalidValue if delay is less than 0.
virtual Result DelayAfterVsync(
Util::fmicroseconds delay,
const IPrivateScreen* pScreen) = 0;
/// Updates page mappings for virtual GPU memory allocations.
///
/// @param [in] rangeCount Number of ranges to remap (i.e., size of the pRanges array).
/// @param [in] pRanges Defines the set of remappings from virtual GPU memory object pages to real GPU
/// memory object pages.
/// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without
/// waiting for any previous rendering to finish first. On platforms that don't support
/// this, the flag will be ignored.
/// @param [in] pFence Optional. Pointer to an IFence, which will be signaled after the VA remapping.
///
/// @returns Success if the remappings were executed successfully. It is assumed that the following conditions are
/// met for the input to this function:
/// + rangeCount is not 0.
/// + The page range for all members of pRanges are valid.
/// + pRanges is not null.
/// + pVirtualGpuMem is not null for any member of pRanges.
/// + pRanges does not specify a real GPU memory object as a virtual GPU memory object or vice versa.
virtual Result RemapVirtualMemoryPages(
uint32 rangeCount,
const VirtualMemoryRemapRange* pRanges,
bool doNotWait,
IFence* pFence) = 0;
/// Copies page mappings from one virtual GPU memory object to another.
///
/// @param [in] rangeCount Number of ranges to copy (i.e., size of the pRanges array).
/// @param [in] pRanges Defines the set of page mappings to copy between virtual GPU memory objects.
/// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without
/// waiting for any previous rendering to finish first. On platforms that don't support
/// this, the flag will be ignored.
///
/// @returns Success if the mappings were copied successfully. It is assumed that the following conditions are
/// met for the input to this function:
/// + rangeCount is not 0.
/// + The page range for all members of pRanges are valid.
/// + pRanges is not null.
/// + pSrcGpuMem or pDstGpuMem is not null for any member of pRanges.
/// + pRanges does not specify a real GPU memory object as source or destination.
virtual Result CopyVirtualMemoryPageMappings(
uint32 rangeCount,
const VirtualMemoryCopyPageMappingsRange* pRanges,
bool doNotWait) = 0;
/// Associates the provided Fence object with the last submission on this queue object. The Fence can be used via
/// GetStatus() to get the status of the last Submit, however no event will be created/set for the Fence so
/// WaitForFences() should NOT be called on the fence after this association.
///
/// @see IFence::GetStatus()
/// @see IFence::WaitForFences()
///
/// @param [in] pFence Fence object to be associated with the last Submit on this queue
///
/// @returns Success if the association was successful. ErrorUnavailable will be returned if there has not yet been
/// a Submit on this queue.
virtual Result AssociateFenceWithLastSubmit(
IFence* pFence) = 0;
/// Sets the execution priority for the current queue. This allows elevating the execution priority of submitted
/// command buffers, but it has no effect on command buffers that have already been submitted for execution.
/// Elevating the queue priority to medium or high allows temporarily stalling a low priority queue's execution and
/// executing its work as soon as the low priority queue starts draining.
///
/// @param [in] priority The priority level of the queue.
virtual void SetExecutionPriority(
QueuePriority priority) = 0;
/// Returns a list of GPU memory allocations used by this queue.
///
/// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value
/// reports the number of GPU memory allocations.
/// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it
/// will reflect the number of allocations that make up this queue. If
/// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number
/// of entries in the pAllocInfoList array. On output, pNumEntries reflects the
/// number of entries in pAllocInfoList that are valid.
///
/// @returns Success if the allocation info was successfully written to the buffer. Otherwise, one of the
/// following errors may be returned:
/// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed.
/// + ErrorInvalidPointer if pNumEntries is nullptr.
virtual Result QueryAllocationInfo(
size_t* pNumEntries,
GpuMemSubAllocInfo* const pAllocInfoList) = 0;
/// Returns the QueueType for the queue.
virtual QueueType Type() const = 0;
/// Returns the EngineType for the queue.
virtual EngineType GetEngineType() const = 0;
/// Queries the kernel context info associated with this queue and copies it into pKernelContextInfo.
///
/// Only supported on Windows platforms.
///
/// @param [out] pKernelContextInfo Pointer to a KernelContextInfo struct to copy the information into.
///
/// @returns Success if the information is successfully copied into the output struct. Otherwise, one of the
/// following errors may be returned:
/// + ErrorInvalidPointer if pKernelContextInfo is nullptr.
/// + ErrorUnavailable if kernel context information is not available on the current platform.
virtual Result QueryKernelContextInfo(KernelContextInfo* pKernelContextInfo) const = 0;
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data; nullptr until SetClientData() has been called with another value.
void* GetClientData() const
{
return m_pClientData;
}
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data. The pointer is stored as-is.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method. The client data pointer starts out as nullptr.
IQueue() : m_pClientData(nullptr) {}
/// @internal Destructor. Prevent use of delete operator on this interface. Queues will be destroyed when the
/// associated device is destroyed.
virtual ~IQueue() { }
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
};
} // Pal
@@ -0,0 +1,275 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palQueueSemaphore.h
* @brief Defines the Platform Abstraction Library (PAL) IQueueSemaphore interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
#include <chrono>
#if defined(_WIN32)
struct _SECURITY_ATTRIBUTES;
#endif
namespace Pal
{
// Forward declaration: QueueSemaphoreOpenInfo below holds an IQueueSemaphore pointer ahead of the interface definition.
class IQueueSemaphore;
/// Specifies properties for @ref IQueueSemaphore creation. Input structure to IDevice::CreateQueueSemaphore().
struct QueueSemaphoreCreateInfo
{
union
{
struct
{
/// This queue semaphore may be opened for use by a different device.
/// For DX12 native fence, the flag needs to be consistent with D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.Shared
/// given by DX runtime.
uint32 shareable : 1;
/// This queue semaphore can only be shared through an NT handle.
/// For DX12 native fence, the flag needs to be consistent with
/// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NtSecuritySharing given by DX runtime.
uint32 sharedViaNtHandle : 1;
uint32 externalOpened : 1; ///< Semaphore was created by other APIs
/// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload
/// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the
/// waiter until the specified payload value has been signaled.
/// For DX12 native fence, the runtime determines initialCount. Therefore, the timeline flag has to be set.
uint32 timeline : 1;
/// Do not signal the queue semaphore to max if the device is lost.
/// For DX12 native fence, the flag needs to be consistent with
/// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NoSignalMaxValueOnTdr given by DX runtime.
uint32 noSignalOnDeviceLost : 1;
/// For native fence only. If it's 0x0, the native fence type is D3DDDI_NATIVEFENCE_TYPE_DEFAULT.
/// If it's 0x1, the native fence type is D3DDDI_NATIVEFENCE_TYPE_INTRA_GPU.
/// For DX12, the value is determined by the runtime. DXCP needs to set it by reading D3DDDI_NATIVEFENCEINFO.
uint32 gpuOnly : 1;
/// This queue semaphore will be a monitored fence if this flag is set, even if the OS supports native fence.
uint32 forceUseMonitoredFence : 1;
uint32 reserved : 25; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< Queue semaphore creation flags.
uint32 maxCount; ///< The maximum signal count; once reached, further signals are dropped. Must be
/// non-zero and no more than maxSemaphoreCount in @ref DeviceProperties. For
/// example, a value of one would request a binary semaphore.
/// NOTE: maxCount does not apply to timeline semaphores.
uint64 initialCount; ///< Initial payload value for timeline semaphores, or initial count value for
/// counting semaphores.
/// Must not be larger than maxCount for counting semaphores.
/// For DX12 native fence, DXCP needs to pass InitialFenceValue from
/// D3DDDI_NATIVEFENCEINFO.
};
/// Specifies parameters for opening a queue semaphore for use on another device. Input structure to
/// IDevice::OpenSharedQueueSemaphore().
struct QueueSemaphoreOpenInfo
{
/// Shared queue semaphore object from another device to be opened.
/// NOTE(review): presumably the semaphore must have been created with the shareable flag of
/// QueueSemaphoreCreateInfo set - confirm.
IQueueSemaphore* pSharedQueueSemaphore;
};
/// Specifies parameters for opening a queue semaphore created by other APIs such as D3D.
struct ExternalQueueSemaphoreOpenInfo
{
union
{
struct
{
uint32 crossProcess : 1; ///< This semaphore is created in another process.
uint32 sharedViaNtHandle : 1; ///< The shared semaphore handle is an NT handle.
uint32 isReference : 1; ///< If set, then the opened semaphore will reference the same sync
/// object in the kernel. Otherwise, the object is copied to the
/// new Semaphore.
/// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload
/// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the
/// waiter until the specified payload value has been signaled.
uint32 timeline : 1;
uint32 reserved : 28; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< External queue semaphore open flags.
OsExternalHandle externalSemaphore; ///< External shared semaphore handle.
// The member below only exists for client interface versions older than 882, and only on unix PAL_KMT_BUILD builds.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 882
#if defined(__unix__) && PAL_KMT_BUILD
uint64 syncFdSignalValue; ///< Signal timeline value when importing the state of a sync file
#endif
#endif
};
/// Specifies parameters for exporting a queue semaphore. Input structure to IQueueSemaphore::ExportExternalHandle().
///
/// Everything after the flags union is only declared when PAL_KMT_BUILD is enabled.
struct QueueSemaphoreExportInfo
{
union
{
struct
{
uint32 isReference : 1; ///< If set, the exported handle references the same sync object in the
///< kernel. Otherwise, the object is copied to the new semaphore.
uint32 reserved : 31; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< External queue semaphore export flags.
#if PAL_KMT_BUILD
const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< Specifies the security descriptor and the inheritable
/// attribute.
const wchar_t* pNtObjectName; ///< Name for the NT handle. If the object is exported as an NT handle with a name,
/// the handle can later be acquired via this name.
uint32 accessFlags; ///< Desired access rights.
/// NOTE(review): the previous wording was "access rights of GPU memory", which looks
/// copied from the GPU memory export path - confirm what these rights apply to.
// The wait value below is additionally restricted to builds where __unix__ is defined.
#if defined(__unix__)
uint64 syncFdWaitValue; ///< Wait timeline value when exporting the state of a sync file.
#endif
#endif
};
/**
***********************************************************************************************************************
* @interface IQueueSemaphore
* @brief Semaphore object used to synchronize GPU work performed by multiple, parallel queues.
*
* These semaphores are used by calling IQueue::SignalQueueSemaphore() and IQueue::WaitQueueSemaphore().
*
* Timeline semaphores can additionally be queried with QuerySemaphoreValue(), and waited on or signaled from the CPU
* with WaitSemaphoreValue() and SignalSemaphoreValue().
*
* @see IDevice::CreateQueueSemaphore()
* @see IDevice::OpenSharedQueueSemaphore()
***********************************************************************************************************************
*/
class IQueueSemaphore : public IDestroyable
{
public:
/// An IQueue::WaitQueueSemaphore operation may need to be sent down to the OS after the corresponding
/// IQueue::SignalQueueSemaphore operation due to GPU scheduler limitations. This method checks if any queues have
/// batched-up commands waiting for a SignalQueueSemaphore operation to appear.
///
/// @returns True if one or more queues have some number of commands batched-up waiting for other queues to signal
/// this semaphore. False otherwise.
virtual bool HasStalledQueues() = 0;
/// Queries the current payload of a timeline semaphore.
///
/// @param [out] pValue Receives the payload returned by the query.
///
/// @returns Success if the timeline semaphore was queried successfully. Otherwise, one of the following errors may
/// be returned:
/// + ErrorInvalidValue if an unexpected conversion error occurs.
/// + ErrorInvalidObjectType if the semaphore is not a timeline semaphore.
virtual Result QuerySemaphoreValue(
uint64* pValue) = 0;
/// Waits on a timeline semaphore point. To be clear, this is a CPU wait.
///
/// @param [in] value Indicates which point to wait for.
/// @param [in] timeout Maximum time to wait, expressed in nanoseconds.
///
/// @returns Success if the timeline semaphore point was waited on successfully. Otherwise, one of the following
/// errors may be returned:
/// + ErrorInvalidValue if an unexpected conversion error occurs.
/// + ErrorInvalidObjectType if the semaphore is not a timeline semaphore.
virtual Result WaitSemaphoreValue(
uint64 value,
std::chrono::nanoseconds timeout) = 0;
/// Signals a timeline semaphore point. To be clear, this is a CPU signal.
///
/// @param [in] value Indicates which point to signal.
///
/// @returns Success if the timeline semaphore point was signaled successfully. Otherwise, one of the following
/// errors may be returned:
/// + ErrorInvalidValue if an unexpected conversion error occurs.
/// + ErrorInvalidObjectType if the semaphore is not a timeline semaphore.
virtual Result SignalSemaphoreValue(
uint64 value) = 0;
#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD
/// Returns an OS-specific handle which can be used to refer to this semaphore object across processes. This will
/// return a null or invalid handle if the object was not created with the external create flag set.
///
/// @param [in] exportInfo Information describing how the semaphore handle should be exported.
///
/// @note This function is only declared when PAL_KMT_BUILD or PAL_AMDGPU_BUILD is enabled.
/// NOTE(review): the previous note said "only available for Linux builds", which does not match the
/// preprocessor guard around this declaration - confirm the intended platforms.
///
/// @returns An OS-specific handle which can be used to access the semaphore object across processes.
virtual OsExternalHandle ExportExternalHandle(
const QueueSemaphoreExportInfo& exportInfo) const = 0;
#endif
#if defined(_WIN32)
/// Returns an OS-specific handle which can be used by another device to access the semaphore object.
///
/// @note This function is only declared when _WIN32 is defined.
///
/// @returns An OS-specific handle which can be used by another device to access the semaphore object.
virtual OsExternalHandle ExportKmtHandle() const = 0;
#endif
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data. This is nullptr until SetClientData() has been called, because the constructor
/// initializes the pointer to nullptr.
void* GetClientData() const
{
return m_pClientData;
}
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data. The pointer is stored as-is.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method.
IQueueSemaphore() : m_pClientData(nullptr) {}
/// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
/// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
/// object on their own.
virtual ~IQueueSemaphore() { }
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
///
/// NOTE(review): unlike IShaderLibrary, this interface does not explicitly delete its copy constructor and copy
/// assignment operator - confirm whether that difference is intentional.
void* m_pClientData;
};
} // Pal
@@ -0,0 +1,251 @@
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palShaderLibrary.h
* @brief Defines the Platform Abstraction Library (PAL) IShaderLibrary interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDestroyable.h"
#include "palStringView.h"
#include "palSpan.h"
namespace Pal
{
struct GpuMemSubAllocInfo;
/// Common flags controlling creation of shader libraries.
///
/// The named one-bit flags share storage with u32All, so the whole set can be cleared or compared in one operation
/// (the declared bit-field widths add up to 32).
union LibraryCreateFlags
{
struct
{
uint32 clientInternal : 1; ///< Internal library not created by the application.
uint32 isGraphics : 1; ///< Whether it is a graphics library.
uint32 reserved : 30; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
};
/// Specifies properties about an indirect function belonging to a @ref IShaderLibrary object. A list of these is
/// returned by IShaderLibrary::GetShaderLibFunctionInfos().
///
/// NOTE(review): this was previously described as "part of the input structure to IDevice::CreateShaderLibrary()",
/// but ShaderLibraryCreateInfo in this header has no member of this type - confirm whether that wording is stale.
struct ShaderLibraryFunctionInfo
{
Util::StringView<char> symbolName; ///< ELF Symbol name for the associated function.
gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during
/// library creation.
};
/// Identifies the sub type (also referred to as the ShaderKind) of a shader.
///
/// Each enumerator's value is written out explicitly; the values are consecutive and start at zero, so Count equals
/// the number of enumerators that precede it.
enum class ShaderSubType : uint32
{
    Unknown       = 0,
    Traversal     = 1,
    RayGeneration = 2,
    Intersection  = 3,
    AnyHit        = 4,
    ClosestHit    = 5,
    Miss          = 6,
    Callable      = 7,
    LaunchKernel  = 8, ///< Raytracing launch kernel
    Count         = 9  ///< Number of enumerators listed above.
};
/// Specifies properties for creation of a compute @ref IShaderLibrary object. Input structure to
/// IDevice::CreateShaderLibrary().
///
/// NOTE(review): the description says "compute", yet LibraryCreateFlags also carries an isGraphics bit - confirm
/// whether this structure is used for graphics libraries as well.
struct ShaderLibraryCreateInfo
{
LibraryCreateFlags flags; ///< Library creation flags.
const void* pCodeObject; ///< Pointer to code-object ELF binary implementing the Pipeline ABI interface.
/// The code-object ELF contains pre-compiled shaders, register values, and
/// additional metadata.
size_t codeObjectSize; ///< Size of code object in bytes.
};
/// Reports properties of a compiled library. Returned by IShaderLibrary::GetInfo().
struct LibraryInfo
{
PipelineHash internalLibraryHash; ///< 128-bit identifier extracted from this library's ELF binary, composed of
/// the state the compiler decided was appropriate to identify the compiled
/// library. The lower 64 bits are "stable"; the upper 64 bits are "unique".
};
/// Reports shader stats. Output structure of IShaderLibrary::GetShaderFunctionStats().
///
/// Multiple bits set in the shader stage mask indicates that multiple shaders have been combined due to HW support.
/// The same information will be repeated for both the constituent shaders in this case.
/// NOTE(review): no shader stage mask member is declared directly in this structure; presumably it lives inside
/// `common` (CommonShaderStats) - confirm.
struct ShaderLibStats
{
ShaderHash shaderHash; ///< Shader hash.
CommonShaderStats common; ///< The shader compilation parameters for this shader.
/// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum
/// of any architectural restriction and any client-requested limit intended to increase the number of waves in
/// flight.
uint32 numAvailableVgprs;
/// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum
/// of any architectural restriction and any client-requested limit intended to increase the number of waves in
/// flight.
uint32 numAvailableSgprs;
size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader.
PipelineHash palInternalLibraryHash; ///< Internal hash of the shader compilation data used by PAL.
uint32 stackFrameSizeInBytes; ///< Shader function stack frame size, in bytes.
ShaderSubType shaderSubType; ///< ShaderSubType / Shader Kind.
CompilerStackSizes cpsStackSizes; ///< Stack sizes used in continuation.
};
/**
***********************************************************************************************************************
* @interface IShaderLibrary
* @brief Object containing one or more shader functions stored in GPU memory. These shader functions are callable
* from the shaders contained within IPipeline objects.
*
* Before a pipeline which calls into this library is bound to a command buffer (using @ref ICmdBuffer::BindPipeline),
* the client must call @ref IPipeline::LinkWithLibraries() and specify this library in the list of linked libraries.
* Failure to comply with this requirement is an error and will result in undefined behavior.
*
* @see IDevice::CreateShaderLibrary()
* @see IPipeline::LinkWithLibraries()
***********************************************************************************************************************
*/
class IShaderLibrary : public IDestroyable
{
public:
/// Returns properties of this library and its corresponding shader functions.
///
/// @returns Property structure describing this library.
virtual const LibraryInfo& GetInfo() const = 0;
/// Returns a list of GPU memory allocations used by this library.
///
/// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value
/// reports the number of GPU memory allocations.
/// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it
/// will reflect the number of allocations that make up this library. If
/// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number
/// of entries in the pAllocInfoList array. On output, pNumEntries reflects the
/// number of entries in pAllocInfoList that are valid.
/// @returns Success if the allocation info was successfully written to the buffer.
/// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed.
/// + ErrorInvalidPointer if pNumEntries is nullptr.
virtual Result QueryAllocationInfo(
size_t* pNumEntries,
GpuMemSubAllocInfo* const pAllocInfoList) const = 0;
/// Gives the client access to the resource ID used for internal PAL events.
/// EX: Resource Create, Resource Bind, Resource Destroy.
///
/// @returns The Resource ID.
virtual const void* GetResourceId() const = 0;
/// Obtains the binary code object for this library.
///
/// @param [in, out] pSize Represents the size of the library ELF. When pBuffer is null, the size required for
/// the library ELF is written to this location.
///
/// @param [out] pBuffer If non-null, the library ELF is written in the buffer. If null, the size required
/// for the library ELF is given out in the location pSize.
///
/// @returns Success if the library binary was fetched successfully.
/// + ErrorUnavailable if the library binary was not fetched successfully.
virtual Result GetCodeObject(
uint32* pSize,
void* pBuffer) const = 0;
/// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @returns Pointer to client data. This is nullptr until SetClientData() has been called, because the constructor
/// initializes the pointer to nullptr.
void* GetClientData() const { return m_pClientData; }
/// Sets the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object.
///
/// @param [in] pClientData A pointer to arbitrary client data. The pointer is stored as-is.
void SetClientData(
void* pClientData)
{
m_pClientData = pClientData;
}
/// Obtains the compiled shader ISA code for the shader function specified.
///
/// @param [in] shaderExportName The shader exported name.
///
/// @param [in, out] pSize Represents the size of the shader ISA code. When pBuffer is null, the size
/// required for the shader ISA is written to this location.
///
/// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required
/// for the shader ISA is given out in the location pSize.
///
/// @returns Success if the shader ISA code was fetched successfully.
/// + ErrorUnavailable if the shader ISA code was not fetched successfully.
virtual Result GetShaderFunctionCode(
Util::StringView<char> shaderExportName,
size_t* pSize,
void* pBuffer) const = 0;
/// Obtains the shader pre and post compilation stats/params for the specified shader function.
///
/// @param [in] shaderExportName The shader exported name.
///
/// @param [out] pShaderStats Pointer to the ShaderLibStats structure which will be filled with the shader stats
/// for the specified shader function. This cannot be nullptr.
///
/// @note NOTE(review): the previous documentation described a `getDisassemblySize` argument that selected whether
/// the disassembly size is reported in isaSizeInBytes. No such argument exists in this signature, so how
/// ShaderLibStats::isaSizeInBytes gets populated should be confirmed against the implementation.
///
/// @returns Success if the stats were successfully obtained for this shader.
/// + ErrorUnavailable if the stats are not available for the requested shader, or if some internal error
/// occurred.
virtual Result GetShaderFunctionStats(
Util::StringView<char> shaderExportName,
ShaderLibStats* pShaderStats) const = 0;
/// Returns the function list owned by this shader library.
///
/// @returns A span of ShaderLibraryFunctionInfo entries, one per function in the list.
virtual const Util::Span<const ShaderLibraryFunctionInfo> GetShaderLibFunctionInfos() const = 0;
protected:
/// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
/// calling the proper create method.
IShaderLibrary() : m_pClientData(nullptr) { }
/// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by
/// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
/// object on their own.
virtual ~IShaderLibrary() { }
private:
/// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
/// and set via SetClientData().
/// For non-top-layer objects, this will point to the layer above the current object.
void* m_pClientData;
// Copy construction and copy assignment are explicitly disabled for this interface.
IShaderLibrary(const IShaderLibrary&) = delete;
IShaderLibrary& operator=(const IShaderLibrary&) = delete;
};
} // Pal