Files
rocm-systems/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h
T
systems-assistant[bot] 27f85500f8 Update amdgpu-windows-interop with latest changes 20251105 (#1728)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-11-05 22:09:25 -05:00

1198 líneas
63 KiB
C++

/*
***********************************************************************************************************************
*
* Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palGpaSession.h
* @brief PAL GPU utility GpaSession class.
***********************************************************************************************************************
*/
#pragma once
#include "palDeque.h"
#include "palDevice.h"
#include "palGpuUtil.h"
#include "palHashSet.h"
#include "palMutex.h"
#include "palPipeline.h"
#include "palVector.h"
#include "palPlatform.h"
#include "palSysMemory.h"
#include "palGpuMemory.h"
#include "palMemTrackerImpl.h"
// Forward declarations of PAL types that are referenced by name in this header.
namespace Pal
{
class ICmdAllocator;
class ICmdBuffer;
class IDevice;
class IGpuEvent;
class IGpuMemory;
class IPerfExperiment;
class IQueue;
class IQueueSemaphore;
struct GlobalCounterLayout;
struct MultiSubmitInfo;
struct ThreadTraceLayout;
// The enum used to place the begin/end timestamps of a timing sample depends on the client interface version:
// PipelineStageFlag for interface major version 900 and newer, HwPipePoint for older versions. Both are
// forward-declared with a uint32 underlying type.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900
enum PipelineStageFlag : uint32;
#else
enum HwPipePoint : uint32;
#endif
}
// Forward declarations of global-namespace structs whose definitions live outside this header.
// NOTE(review): presumably these are the SQTT/RGP file-format chunk and record structs - confirm against the
// header that defines them.
struct SqttFileChunkCpuInfo;
struct SqttFileChunkAsicInfo;
struct SqttCodeObjectDatabaseRecord;
struct GpuMemoryInfo;
namespace GpuUtil
{
/// Value used to initialize a sample id (all 32 bits set).
constexpr Pal::uint32 InvalidSampleId{0xFFFFFFFFu};
/// The states a GpaSession can be in.
enum class GpaSessionState : Pal::uint32
{
    /// Newly created sessions start in this state; Begin() moves the session to Building.
    Reset    = 0,
    /// Samples are being added to the session; End() moves the session to Complete.
    Building = 1,
    /// End() has been called on the session.
    Complete = 2,
    /// Results can be queried via GetResults().
    Ready    = 3,
};
/// The various ways trace options can be changed after a trace has started.
enum class UpdateSampleTraceMode : Pal::uint32
{
    /// Converts a minimal trace (needed for context in compute presents) to a full trace according to the options
    /// in the active trace. Requires enableSampleUpdates. Additionally, this must be called between BeginSample()
    /// and EndSample(), and queue timing must also be enabled on the GpaSession when this function is called.
    MinimalToFullMask     = 0,

    /// Enables instruction-level trace globally at any time. Can be run without an active sample. Useful for
    /// targeting specific parts of a frame.
    StartInstructionTrace = 1,

    /// Disables instruction-level trace globally at any time. Can be run without an active sample.
    StopInstructionTrace  = 2,
};
/// Specifies the basic type of sample to perform - either a normal set of "global" perf counters, or a trace
/// consisting of SQ thread trace and/or streaming performance counters.
enum class GpaSampleType : Pal::uint32
{
    /// No profile will be done.
    None       = 0x0,

    /// One 64-bit result will be returned per performance counter representing the cumulative delta for that
    /// counter over the sample period. Cumulative samples must begin and end in the same command buffer.
    Cumulative = 0x1,

    /// A GPU memory buffer will be filled with hw-specific SQ thread trace and/or streaming performance counter
    /// data. Trace samples may span multiple command buffers.
    Trace      = 0x2,

    /// Two 64-bit results will be recorded in beginTs and endTs to gather timestamp data.
    Timing     = 0x3,

    /// A set of 11 pipeline stats will be collected.
    Query      = 0x4,

    /// Implicitly 0x5, i.e. one past the last sample type above.
    Count
};
/// Specifies a specific performance counter to be sampled with GpaSession::BeginSample() and GpaSession::EndSample().
///
/// This identifies a specific counter in a particular HW block instance, e.g., TCC instance 3 counter #19. It is up
/// to the client to know the meaning of a particular counter, e.g., TCC #19 is TCC_PERF_SEL_MISS on Fiji. Eventually,
/// PAL may want to support certain counters without the client needing HW-specific knowledge (i.e., select an enum
/// called L2MissRate from PAL rather than needing to know that counter is TCC #19 on Fiji), but GPA currently works in
/// this low-level mode with other drivers, and wants to keep the flexibility.
struct PerfCounterId
{
Pal::GpuBlock block; ///< Which GPU block to reference (e.g., CB, DB, TCC).
Pal::uint32 instance; ///< Which instance of the specified GPU block to sample. E.g., Tahiti has 12 TCC blocks
/// (this number is returned per-block in the @ref Pal::GpuBlockPerfProperties structure).
/// There is no shortcut to get results for all instances of block in the whole chip, the
/// client must explicitly sample each instance and sum the results.
Pal::uint32 eventId; ///< Counter ID to sample. Note that the meaning of a particular eventId for a block can
/// change between chips.
union
{
struct
{
Pal::uint32 spm32Bit : 1; ///< For SPM counters, collect in 32bit instead of 16bit.
Pal::uint32 reserved : 31; ///< Reserved for future use.
};
Pal::uint32 u32All; ///< Union value for copying.
} flags; ///< Per-counter flags.
// Some blocks have additional per-counter controls. They must be properly programmed when adding counters for
// the relevant blocks. It's recommended to zero them out when not in use.
union
{
struct
{
Pal::uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield.
} df;
struct
{
Pal::uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold.
Pal::uint8 eventThresholdEn; ///< Threshold enable (0 for disabled, 1 for <threshold, 2 for >threshold).
Pal::uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write).
} umc;
Pal::uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events.
Pal::uint32 u32All; ///< Union value for copying; must be increased in size if any element of the union exceeds
/// 32 bits.
} subConfig; ///< Block-specific per-counter controls (see the note above this union).
};
/// Set of flags controlling the behavior of a particular GPA session.
union GpaSessionFlags
{
    struct
    {
        Pal::uint32 enableQueueTiming               : 1;  ///< Enables timing of queue operations via Timed*
                                                          ///  functions.
        Pal::uint32 enableSampleUpdates             : 1;  ///< Enables sample updates via the
                                                          ///  UpdateSampleTraceParams function.
        Pal::uint32 useInternalQueueSemaphoreTiming : 1;  ///< Indicates that the client will use the internal
                                                          ///  Timed*QueueSemaphore() functions for queue semaphore
                                                          ///  timing data. When not set it indicates the client
                                                          ///  will provide ETW data via the ExternalTimed*
                                                          ///  functions.
        Pal::uint32 reserved                        : 29; ///< Reserved for future use.
    };

    Pal::uint32 u32All; ///< Flags packed as 32-bit uint.
};
/// Options that direct the GPA session's behavior; passed to GpaSession::Begin().
struct GpaSessionBeginInfo
{
    GpaSessionFlags flags; ///< Gpa Session flags used to control behavior.
};
/// Input structure for CmdBeginGpuProfilerSample. Also the sample description taken by GpaSession::BeginSample().
///
/// Defines a set of global performance counters and/or SQ thread trace data to be sampled.
struct GpaSampleConfig
{
/// Selects what type of data should be gathered for this sample. This can either be _cumulative_ to gather
/// simple deltas for the specified set of perf counters over the sample period, or it can be _trace_ to generate
/// a blob of RGP-formatted data containing SQ thread trace and/or streaming performance monitor data.
GpaSampleType type;
union
{
struct
{
Pal::uint32 sampleInternalOperations : 1; ///< Include BLTs and internal driver operations in the
/// results.
Pal::uint32 cacheFlushOnCounterCollection : 1; ///< Insert cache flush and invalidate events before and
/// after every sample.
Pal::uint32 sqShaderMask : 1; ///< Set if the sqShaderMask member of this struct is valid.
Pal::uint32 sqWgpShaderMask : 1; ///< Set if the sqWgpShaderMask member of this struct is valid.
Pal::uint32 reserved : 28; ///< Reserved for future use.
};
Pal::uint32 u32All; ///< Bit flags packed as uint32.
} flags; ///< Bit flags controlling sample operation for all sample
/// types.
Pal::PerfExperimentShaderFlags sqShaderMask; ///< Which shader stages are sampled by GpuBlock::Sq counters.
///< Only used if flags.sqShaderMask is set to 1.
Pal::PerfExperimentShaderFlags sqWgpShaderMask; ///< Which shader stages are sampled by GpuBlock::SqWgp counters.
///< Only used if flags.sqWgpShaderMask is set to 1.
struct
{
/// Number of entries in pIds.
Pal::uint32 numCounters;
/// List of performance counters to be gathered for a sample. If the sample type is _cumulative_ this will
/// result in "global" perf counters being sampled at the beginning of the sample period; if the sample type
/// is _trace_ this will result in SPM data being added to the sample's resulting RGP blob.
///
/// Note that it is up to the client to respect the hardware counter limit per block. This can be
/// determined by the maxGlobalOnlyCounters, maxGlobalSharedCounters, maxSpmCounters, and instanceGroupSize
/// fields of @ref Pal::GpuBlockPerfProperties.
const PerfCounterId* pIds;
/// Period for SPM sample collection in cycles. Only relevant for _trace_ samples.
Pal::uint32 spmTraceSampleInterval;
/// Maximum amount of GPU memory in bytes this sample can allocate for SPM data. Only relevant for _trace_
/// samples.
Pal::gpusize gpuMemoryLimit;
} perfCounters; ///< Performance counter selection (valid for both _cumulative_ and _trace_ samples).
struct
{
/// Number of entries in pIds.
Pal::uint32 numCounters;
/// Period for DF SPM sample collection in nanoseconds.
Pal::uint32 sampleInterval;
/// Maximum amount of GPU memory in bytes this sample can allocate for DF SPM data.
Pal::gpusize gpuMemoryLimit;
/// List of performance counters to be gathered for a DF sample. This has to be separate from the list of
/// normal counters because it is a completely different mechanism for gathering data.
///
/// Note that it is up to the client to respect the hardware counter limit per block. This can be
/// determined by the maxSpmCounters fields of
/// @ref Pal::GpuBlockPerfProperties.
const PerfCounterId* pIds;
} dfSpmPerfCounters; ///< DF SPM performance counter selection.
struct
{
union
{
struct
{
Pal::uint32 enable : 1; ///< Include SQTT data in the trace.
// NOTE(review): "supress" is misspelled, but this is the public field name and cannot be renamed here
// without breaking clients.
Pal::uint32 supressInstructionTokens : 1; ///< Prevents capturing instruction-level SQTT tokens,
/// significantly reducing the amount of sample data.
Pal::uint32 stallMode : 2; ///< Describes behavior when the buffer is full.
Pal::uint32 stallAllSimds : 1; ///< Stall all SIMDs for thread trace stall.
Pal::uint32 excludeNonDetailShaderData : 1; ///< Only emit shader tokens from the SIMD that have been
/// selected for detail instruction tracing.
Pal::uint32 enableExecPopTokens : 1; ///< Output exec tokens.
Pal::uint32 reserved : 25; ///< Reserved for future use.
};
Pal::uint32 u32All; ///< Bit flags packed as uint32.
} flags; ///< Bit flags controlling SQTT samples.
Pal::uint32 seMask; ///< Mask that determines which specific SEs to run Thread trace on.
/// If 0, all SEs are enabled.
Pal::uint32 seDetailedMask; ///< Mask that selects which specific SEs to reveal Thread trace detailed info.
/// If 0, all SEs will reveal detailed thread trace.
Pal::gpusize gpuMemoryLimit; ///< Maximum amount of GPU memory in bytes this sample can allocate for the SQTT
/// buffer. If 0, allocate maximum size to prevent dropping tokens toward the
/// end of the sample.
Pal::uint32 tokenMask; ///< Mask indicating which SQTT tokens are requested for capture. If a tokenMask is
/// not provided, PAL will default to collecting all tokens or tokens except
/// instruction tokens if the supressInstructionTokens flag is set. Instruction
/// tokens will always be filtered out if supressInstructionTokens = true.
} sqtt; ///< SQ thread trace configuration (only valid for _trace_ samples).
struct
{
// The member type follows the client interface version; the member names are the same in both branches.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900
Pal::PipelineStageFlag preSample; ///< The pipeline stage in the GPU pipeline where the begin timestamp should
/// take place.
Pal::PipelineStageFlag postSample; ///< The pipeline stage in the GPU pipeline where the end timestamp should
/// take place.
#else
Pal::HwPipePoint preSample; ///< The point in the GPU pipeline where the begin timestamp should take place.
Pal::HwPipePoint postSample; ///< The point in the GPU pipeline where the end timestamp should take place.
#endif
} timing; ///< Timestamp configuration. (only valid for timing samples)
};
/// Additional metadata describing a command buffer submission.
struct TimedSubmitInfo
{
    /// Array of api specific command buffer ids.
    const Pal::uint64* pApiCmdBufIds;
    /// Array of sqtt command buffer ids.
    const Pal::uint32* pSqttCmdBufIds;
    /// The global frame index for the application.
    Pal::uint64        frameIndex;
};
/// Additional metadata describing a queue semaphore operation.
struct TimedQueueSemaphoreInfo
{
    /// Api specific id associated with a semaphore.
    Pal::uint64 semaphoreID;
};
/// Additional metadata describing a queue present operation.
struct TimedQueuePresentInfo
{
    /// Api specific id associated with a present.
    Pal::uint64 presentID;
};
/// Holds a sample of the GPU clock speeds.
struct GpuClocksSample
{
    /// Current speed of the gpu engine clock in MHz.
    Pal::uint32 gpuEngineClockSpeed;
    /// Current speed of the gpu memory clock in MHz.
    Pal::uint32 gpuMemoryClockSpeed;
};
/// Describes one CPU-side allocation made for a Pal::IPerfExperiment.
struct PerfExperimentMemory
{
    /// Memory allocated for an IPerfExperiment.
    void*  pMemory;
    /// Size of the memory allocated in pMemory.
    size_t memorySize;
};
/// API-dependent information supplied about a pipeline.
struct RegisterPipelineInfo
{
    /// Client-provided PSO hash.
    Pal::uint64 apiPsoHash;
};
/// API-dependent information supplied about a library.
struct RegisterLibraryInfo
{
    /// Client-provided api hash.
    Pal::uint64 apiHash;
};
/// Describes an Elf binary being supplied.
struct ElfBinaryInfo
{
    /// FAT Elf binary.
    const void*      pBinary;
    /// FAT Elf binary size.
    Pal::uint32      binarySize;
    /// GPU Memory where the compiled ISA resides.
    Pal::IGpuMemory* pGpuMemory;
    /// Offset inside GPU memory object.
    Pal::gpusize     offset;
    /// Original source/binary hash.
    Pal::uint64      originalHash;
    /// Compiled binary hash.
    Pal::uint64      compiledHash;
};
/// The profiling modes that can trigger an RGP trace.
enum class TraceProfilingMode : Pal::uint32
{
    /// Present triggered capture.
    Present     = 0,
    /// Capture triggered by user marker.
    UserMarkers = 1,
    /// Capture based on frame number.
    FrameNumber = 2,
    /// Tag based capture.
    Tags        = 3,
};
/// Maximum length for a user marker string; also the size of the char buffers that store such strings.
static constexpr Pal::uint32 UserMarkerStringLength{256u};
/// Data specific to each profiling mode used to capture an RGP trace.
/// NOTE(review): presumably the active member is the one matching the accompanying TraceProfilingMode - confirm.
union TraceProfilingModeData
{
    /// Data for user-marker triggered captures.
    struct
    {
        /// User marker string used to start trace capture.
        char start[UserMarkerStringLength];
        /// User marker string used to end trace capture.
        char end[UserMarkerStringLength];
    } userMarkerData;

    /// Data for frame-number based captures.
    struct
    {
        /// Frame number used to start the trace.
        Pal::uint32 start;
        /// Frame number used to end the trace.
        Pal::uint32 end;
    } frameNumberData;

    /// Data for tag based captures.
    struct
    {
        /// Tag used to start the trace.
        Pal::uint64 start;
        /// Tag used to end the trace.
        Pal::uint64 end;
    } tagData;
};
/// The instruction level data modes available for an RGP trace.
enum class InstructionTraceMode : Pal::uint32
{
    /// Instruction level data was disabled for trace.
    Disabled  = 0,
    /// Instruction level data was enabled for the full trace.
    FullFrame = 1,
    /// Instruction level data was enabled only for a single API PSO.
    ApiPso    = 2,
};
/// Data used to control enabling of instruction level data.
struct InstructionTraceModeData
{
    /// Hash of the API PSO targeted for instruction level data.
    Pal::uint64 apiPsoHash;
};
/// API specific information supplied about an RGP trace.
struct SampleTraceApiInfo
{
    /// Profiling mode used to trigger the trace.
    TraceProfilingMode       profilingMode;
    /// Profiling mode specific data.
    TraceProfilingModeData   profilingModeData;
    /// Instruction trace mode for the trace.
    InstructionTraceMode     instructionTraceMode;
    /// Instruction trace mode data.
    InstructionTraceModeData instructionTraceModeData;
};
/// The API types. Note that the value 4 is not assigned to any enumerator here.
enum class ApiType : Pal::uint32
{
    /// Represents DirectX12 API type.
    DirectX12 = 0,
    /// Represents Vulkan API type.
    Vulkan    = 1,
    /// Represents Generic API type.
    Generic   = 2,
    /// Represents OpenCL API type.
    OpenCl    = 3,
    /// Represents HIP API type.
    Hip       = 5,
};
/// SQTT-specific information about a trace.
struct SqttTraceInfo
{
    /// Shader engine index.
    Pal::uint32 shaderEngine;
    /// Compute unit index.
    Pal::uint32 computeUnit;
    /// SQTT version.
    Pal::uint32 sqttVersion;
    /// SQTT trace buffer size.
    Pal::uint64 bufferSize;
};
/// SPM-specific information about a trace.
struct SpmTraceInfo
{
    /// The number of SPM counters sampled in the trace.
    Pal::uint32 numSpmCounters;
    /// The number of timestamps at which samples were taken.
    Pal::uint32 numTimestamps;
    /// The SPM counter sampling frequency.
    Pal::uint32 sampleFrequency;
};
/// QueueTimings-specific information about a trace.
/// NOTE(review): the fields are undocumented in the original; the descriptions below are inferred from the field
/// names only and should be confirmed against the code that fills this struct.
struct QueueTimingsTraceInfo
{
    /// Presumably the number of queue info records - TODO confirm.
    Pal::uint32 numQueueInfoRecords;
    /// Presumably the number of queue event records - TODO confirm.
    Pal::uint32 numQueueEventRecords;
    /// Presumably the size of the queue info table (units not visible here) - TODO confirm.
    Pal::uint32 queueInfoTableSize;
    /// Presumably the size of the queue event table (units not visible here) - TODO confirm.
    Pal::uint32 queueEventTableSize;
};
/**
***********************************************************************************************************************
* @class GpaSession
* @brief Helper class providing common driver functionality required by all PAL clients that support the GPUPerfAPI
* (GPA). Abstracts IPerfExperiment creation, memory management, completion confirmation, and results reporting
* at a level convenient for GPA. Each PAL client driver will need to publish an API extension exposing this
* support for use by GPA.
*
* A GpaSession is a container for a set of _samples_ of performance counter and/or SQ thread trace data. Its main
* purpose is to manage resources (IPerfExperiments and their backing system/GPU memory) in an efficient manner that is
* consistent with command buffer management in modern APIs. Consider GpaSession as a peer of DX12's command
* allocator or Vulkan's command pool objects.
*
* Basic flow of usage:
* - Newly created sessions are in the _reset_ state.
* - A session is moved from the _reset_ state to the _building_ state by calling Begin().
* - Samples are added to a session by specifying desired data for each query and marking a begin and end location
* in ICmdBuffers as they are built. Internally required resources, like GPU memory where counters will be
* written, are allocated from internal pools managed by the session.
* - A session is moved from the _building_ state to the _complete_ state by calling End().
* - The application will submit all command buffers referenced by the session.
* - The session is confirmed as _ready_, either using standard PAL fences to confirm all associated submissions have
* completed, or by polling IsReady() on the session.
* - Results for all samples in the session can be queried via GetResults().
* - Reset() should be called once results have been gathered and before building a new session. Resources are
* retained by the session object for use in the newly built session. The session object must be destroyed in
* order to fully release all resource back to the system.
*
* Cumulative-type samples may not span multiple command buffers, as other apps could interfere with the counts and
* the final data doesn't have the time-based visibility needed to detect that this happened.
*
* @warning GpaSession is not thread safe. Performing samples in command buffers being built simultaneously by multiple
* threads should use multiple GpaSession objects.
***********************************************************************************************************************
*/
class GpaSession
{
typedef Pal::IPlatform GpaAllocator;
public:
typedef Util::Deque<PerfExperimentMemory, GpaAllocator> PerfExpMemDeque;
/// Constructor.
GpaSession(
Pal::IPlatform* pPlatform,
Pal::IDevice* pDevice,
Pal::uint16 apiMajorVer,
Pal::uint16 apiMinorVer,
ApiType apiType,
Pal::uint16 rgpInstrumentationSpecVer = 0,
Pal::uint16 rgpInstrumentationApiVer = 0,
PerfExpMemDeque* pAvailablePerfExpMem = nullptr);
~GpaSession();
/// Copy constructor creates an empty copy of a session.
///
/// Newly constructed session copies the GPU memory allocations and their layout from the source session, making
/// this a valid destination for a CopyResults command. This new object is effectively in the _complete_ state.
///
/// The purpose of such objects is to handle sampling data from bundles or nested command buffers where the same
/// set of commands might be executed multiple times from a single root-level command buffer. The client should
/// note such cases, and create a copy of the bundle's session for each invocation, then call CopyResults() from
/// the original session into the copy after the invocation.
///
/// @param [in] src Session to be copied. Must either be in the _complete_ or _ready_ state.
explicit GpaSession(const GpaSession& src);
/// Initialize the newly constructed GPA session.
Pal::Result Init();
/// Registers a queue with the GpaSession that will be submitted to using TimedSubmit. This must be called on any
/// queues that are submitted to via the Timed* functions. For Timed* signal and wait queue semaphore events, a
/// valid queueContext will be required (queueContext not equal to 0).
Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue,
Pal::uint64 queueId,
Pal::uint64 queueContext);
/// Unregisters a queue prior to object destruction, and ensure that associated resources are destroyed. Work can
/// no longer be submitted on the queue after this has been called.
Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue);
Pal::Result TimedSubmit(Pal::IQueue* pQueue,
const Pal::MultiSubmitInfo& submitInfo,
const TimedSubmitInfo& timedSubmitInfo);
/// Executes a timed queue semaphore signal through the given queue. The HW time is measured when the queue semaphore
/// is signaled.
Pal::Result TimedSignalQueueSemaphore(Pal::IQueue* pQueue,
Pal::IQueueSemaphore* pQueueSemaphore,
const TimedQueueSemaphoreInfo& timedSignalInfo,
Pal::uint64 value = 0);
/// Executes a timed queue semaphore wait through the given queue. The HW time is measured when the queue semaphore
/// wait finishes.
Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue,
Pal::IQueueSemaphore* pQueueSemaphore,
const TimedQueueSemaphoreInfo& timedWaitInfo,
Pal::uint64 value = 0);
/// Injects a timed queue present event.
Pal::Result TimedQueuePresent(Pal::IQueue* pQueue,
const TimedQueuePresentInfo& timedPresentInfo);
/// Injects a timed wait queue semaphore event using information supplied by an external source.
/// A valid queueContext (queueContext not equal to 0) is needed for this function.
Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext,
Pal::uint64 cpuSubmissionTimestamp,
Pal::uint64 cpuCompletionTimestamp,
const TimedQueueSemaphoreInfo& timedWaitInfo);
/// Injects a timed signal queue semaphore event using information supplied by an external source.
/// A valid queueContext (queueContext not equal to 0) is needed for this function.
Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext,
Pal::uint64 cpuSubmissionTimestamp,
Pal::uint64 cpuCompletionTimestamp,
const TimedQueueSemaphoreInfo& timedSignalInfo);
/// Queries the engine and memory clocks from DeviceProperties
Pal::Result SampleGpuClocks(GpuClocksSample* pGpuClocksSample) const;
/// Samples the timing clocks if queue timing is enabled and adds a clock sample entry to the current session.
Pal::Result SampleTimingClocks();
/// Moves the session from the _reset_ state to the _building_ state.
///
/// Invalid to call Begin() on a session that isn't in the _reset_ state.
///
/// @param [in] info Information about the gpa sessions desired behavior.
///
/// @returns Success if the session was successfully moved to the _building_ state. Otherwise, possible errors
/// include:
/// + ErrorUnavailable if the session isn't currently in the _reset_ state.
Pal::Result Begin(const GpaSessionBeginInfo& info);
/// Moves the session from the _building_ state to the _complete_ state.
///
/// Invalid to call End() on a session that isn't in the _building_ state. The implementation _may_ insert GPU
/// commands into the specified pCmdBuf - in the case of a session that spans multiple command buffers, the
/// command buffer specified to End() _must_ be the last command buffer of the session that is submitted.
///
/// @param [in] pCmdBuf Last (normally _only_) command buffer of the session. Can be used by implementation
/// to insert GPU commands required after all samples are inserted (e.g., to confirm session
/// completion).
///
/// @returns Success if the session was successfully moved to the _complete_ state. Otherwise, possible errors
/// include:
/// + ErrorUnavailable if the session isn't currently in the _building_ state.
Pal::Result End(Pal::ICmdBuffer* pCmdBuf);
/// Marks the beginning of a range of GPU operations to be measured and specifies what data should be recorded.
///
/// It is possible the sample will not succeed due to internal memory allocation failure, etc. In those cases,
/// the session will be marked invalid and no sample commands will be inserted. Reporting of this error is
/// delayed until GetResults().
///
/// A note for GpuBlock::SqWgp
/// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity
/// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock
/// gating (CGCG) and power gating. PAL expose this feature to clients.
/// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same
/// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity.
/// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features
/// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE.
/// The counter data is still reported per WGP (not aggregated for the whole SE).
///
/// Check the following two documents for details:
///
/// @param [in] pCmdBuf Command buffer to issue the begin sample commands. All operations performed
/// between executing the BeginSample() and EndSample() GPU commands will contribute to
/// the sample results.
/// @param [in] sampleConfig Describes what data should be sampled.
/// @param [out] pSampleId An ID corresponding to this sample. This ID should be recorded and passed back to
/// EndSample() when the sampled command buffer range is complete. This ID should also
/// be passed to GetResults() when the session is in the _ready_ state in order to get
/// the results of this sample.
///
/// @returns Success if the update was successful. Unsupported if the sample config type is not supported.
/// Otherwise, possible errors include:
/// + ErrorInvalidPointer if pCmdBuf or pSampleId is nullptr.
Pal::Result BeginSample(
Pal::ICmdBuffer* pCmdBuf,
const GpaSampleConfig& sampleConfig,
Pal::uint32* pSampleId);
/// Updates the trace parameters for a specific sample.
///
/// @param [in] pCmdBuf Command buffer to issue the update commands.
/// @param [in] sampleId Identifies the sample to be updated, if required by the mode. This should be a value
/// returned by BeginSample(), and must correspond to a thread trace sample.
/// @param [in] updateMode The way the sample parameters should be set. Some modes have additional restrictions.
/// @see UpdateSampleTraceMode
///
/// @returns Success if the update was successful. Otherwise, possible errors
/// include:
/// + ErrorInvalidPointer if pCmdBuf is nullptr.
/// + ErrorInvalidObjectType if a sample is required and the sample associated with sampleId is not a
/// trace sample.
Pal::Result UpdateSampleTraceParams(
Pal::ICmdBuffer* pCmdBuf,
Pal::uint32 sampleId,
UpdateSampleTraceMode updateMode);
/// Marks the end of a range of command buffer operations to be measured.
///
/// @param [in] pCmdBuf Command buffer to issue the end sample commands. All operations performed between
/// executing the BeginSample() and EndSample() GPU commands will contribute to the sample
/// results. _Cumulative_ samples (i.e., global performance counter samples) must never span
/// multiple command buffers (EndSample() should be called in the same command buffer as
/// BeginSample()).
/// @param [in] sampleId Identifies the sample to be ended. This should be the value returned by BeginSample()
/// for the sample that is being ended.
///
/// @note BeginSample() must be called before EndSample() _and_ the GPU commands inserted by BeginSample() must be
/// executed before the command inserted by EndSample(). Since a session is a single-threaded object, this
/// will normally happen naturally.
void EndSample(
Pal::ICmdBuffer* pCmdBuf,
Pal::uint32 sampleId);
/// Copies the DF SPM trace buffer to the GpaSession result buffer
///
/// @param [in] pCmdBuf Command buffer to issue the copy commands.
/// @param [in] sampleId Identifies the sample to be copied.
/// @note This must be called after a command buffer with the dfSpmTraceEnd CmdBufInfo flag
/// and with a separate command buffer. DF SPM traces are on a per command buffer granularity
/// because they are started and stopped by the KMD.
void CopyDfSpmTraceResults(
Pal::ICmdBuffer* pCmdBuf,
Pal::uint32 sampleId);
/// Provides API specific information about an RGP trace.
///
/// @param [in] traceApiInfo Const reference to the struct of API specific information.
/// @param [in] sampleId Sample ID (returned by BeginSample) for the RGP trace type sample info is being
/// provided for.
void SetSampleTraceApiInfo(
const SampleTraceApiInfo& traceApiInfo,
Pal::uint32 sampleId) const;
/// Reports if GPU execution of this session has completed and results are _ready_ for querying from the CPU via
/// GetResults().
///
/// @returns true if all samples in the session have completed GPU execution.
bool IsReady() const;
/// Reports results of a particular sample. Only valid for sessions in the _ready_ state.
///
/// Results will be formatted depending on the sample type:
/// + Cumulative: Results will be an array of uint64 values in the order of perf counter IDs specified by
/// BeginSample().
/// + Trace: Results will be a binary blob in the RGP file format.
///
/// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample().
/// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space
/// available in pData, and *pSizeInBytes will be set to the amount of space written
/// to pData. If pData is null, *pSizeInBytes will be set to the amount of space
/// required.
/// @param [out] pData Can be null to query how much size is required (should only be necessary when
/// getting RGP data). If non-null, the sample results will be written to this
/// location.
///
/// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
/// size is successfully written to pSizeInBytes). Otherwise, possible errors include:
/// + ErrorUnavailable if the session is not in the _ready_ state.
/// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources.
/// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results.
Pal::Result GetResults(
Pal::uint32 sampleId,
size_t* pSizeInBytes,
void* pData) const;
/// Retrieves the SQTT results. Only valid for sessions in the _ready_ state.
///
/// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample().
/// @param [in] traceIndex The index of the trace to get.
/// @param [out] pTraceInfo Optional pointer to a structure which will be written with information about the trace.
/// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space
/// available in pData, and *pSizeInBytes will be set to the amount of space written
/// to pData. If pData is null, *pSizeInBytes will be set to the amount of space
/// required.
/// @param [out] pData Can be null to query how much size is required.
/// If non-null, the sample results will be written to this location.
///
/// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
/// size is successfully written to pSizeInBytes). Otherwise, possible errors include:
/// + ErrorUnavailable if the session is not in the _ready_ state.
/// + NotFound if the given index is not valid.
/// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources.
/// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results.
/// + ErrorInvalidPointer if pSizeInBytes is NULL.
Pal::Result GetSqttTraceData(
Pal::uint32 sampleId,
Pal::uint32 traceIndex,
SqttTraceInfo* pTraceInfo,
size_t* pSizeInBytes,
void* pData) const;
/// Retrieves the SPM trace results of a particular sample. Only valid for 'Trace' type samples and sessions
/// in the _ready_ state.
///
/// Results in the output buffer are a binary blob formatted according to the RGP specification.
/// The data layout of the populated output buffer is as follows:
/// - Timestamps array [size: "numTimestamps * sizeof(uint64)" bytes]
/// - SpmCounterInfo array [size: "numSpmCounters * sizeof(SpmCounterInfo)" bytes]
/// - SPM Counter Data matrix [size: "*pSizeInBytes - (timestamps array + SpmCounterInfo array size)" bytes]
///
/// The SPM Counter Data matrix is laid out linearly in a row-major format. There are "numSpmCounters" rows and
/// "numTimestamps" columns. Each element in the matrix is either 16- or 32-bits, based on the "dataSize" field
/// of the corresponding "SpmCounterInfo" entry.
///
/// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample().
/// @param [out] pTraceInfo Optional. If non-null, this structure is populated with trace metadata.
/// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space
/// available in pData.
/// If pData is null, *pSizeInBytes will be set to the amount of space
/// required.
/// @param [out] pData Can be null to query how much size is required.
/// If non-null, the sample results will be written to this location.
///
/// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required
/// size is successfully written to pSizeInBytes). Otherwise, possible errors include:
/// + ErrorUnavailable if the session is not in the _ready_ state.
/// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources.
/// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results.
Pal::Result GetSpmTraceData(
Pal::uint32 sampleId,
SpmTraceInfo* pTraceInfo,
size_t* pSizeInBytes,
void* pData) const;
/// Retrieves the Queue Timings data from the active GpaSession.
/// Only valid when the GpaSession had `enableQueueTiming` flag set.
///
/// @param [out] pTraceInfo Optional. If non-null, this structure is populated with metadata.
/// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space
/// available in pData.
/// If pData is null, *pSizeInBytes will be set to the amount of space
/// required.
/// @param [out] pData Can be null to query how much size is required.
/// If non-null, the queue timings data will be written to this location.
///
/// @returns Success if the queue timings data is successfully written to pData (or, if pData is null, the required
/// size is successfully written to pSizeInBytes). Otherwise, possible errors include:
/// + ErrorUnavailable if the session was not configured with `enableQueueTiming`.
Pal::Result GetQueueTimingsData(
QueueTimingsTraceInfo* pTraceInfo,
size_t* pSizeInBytes,
void* pData) const;
/// Moves the session to the _reset_ state, marking all of the session's resources as unused and available for
/// reuse when the session is re-built.
///
/// @warning This function cannot be called when the session is queued for execution on the GPU. The client must
/// confirm this is not the case using IsReady(), fences, etc.
///
/// @returns Success if the session was successfully moved to the _reset_ state. Otherwise, possible errors
/// include:
/// + ErrorUnknown if an internal PAL error occurs.
Pal::Result Reset();
/// Uses the GPU to copy results from a nested command buffer's session into a root-level command buffer's per-
/// invocation session data.
///
/// This command will implicitly wait for the source session (as specified in the copy constructor) to be complete,
/// then use the GPU to update this session's data. This allows the client to get accurate sample data in the
/// case where a nested command buffer is launched multiple times from the same root-level command buffer.
///
/// The session remains in the _complete_ state after calling this, and the client should submit the commands
/// and verify their completion to move to the _ready_ state.
///
/// @param [in] pCmdBuf Command buffer where the session copy should be performed.
void CopyResults(Pal::ICmdBuffer* pCmdBuf);
/// Registers a pipeline with the GpaSession for obtaining shader dumps and load events in the RGP file.
///
/// @param [in] pPipeline The PAL pipeline to be tracked.
/// @param [in] clientInfo API-dependent information for this pipeline to also be recorded.
///
/// @returns Success if the pipeline has been registered with GpaSession successfully. Otherwise, possible results
/// include:
/// + AlreadyExists if a duplicate pipeline is provided.
Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo);
/// Unregisters a pipeline from the GpaSession for obtaining unload events in the RGP file.
/// This should be called immediately before destroying the PAL pipeline object.
///
/// @param [in] pPipeline The PAL pipeline to be unregistered.
///
/// @returns Success if the pipeline has been unregistered with GpaSession successfully.
Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline);
/// Registers a shader library with the GpaSession for obtaining shader dumps and load events in the RGP file.
///
/// @param [in] pLibrary The PAL library to be tracked.
/// @param [in] clientInfo API-dependent information for this library to also be recorded.
///
/// @returns Success if the library has been registered with GpaSession successfully. Otherwise, possible results
/// include:
/// + AlreadyExists if a duplicate library is provided.
Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo);
/// Unregisters a shader library from the GpaSession for obtaining unload events in the RGP file.
/// This should be called immediately before destroying the PAL library object.
///
/// @param [in] pLibrary The PAL library to be unregistered.
///
/// @returns Success if the library has been unregistered with GpaSession successfully.
Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary);
/// Registers an ELF binary with the GpaSession for obtaining kernel dumps and load events in the RGP file.
///
/// @param [in] elfBinaryInfo Contains information about the ELF binary to be recorded.
///
/// @returns Success if the ELF binary has been registered with GpaSession successfully.
Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo);
/// Unregisters an ELF binary from the GpaSession for obtaining unload events in the RGP file.
/// This should be called immediately before destroying the ELF binary.
///
/// @param [in] elfBinaryInfo Contains the ELF binary info to be removed from tracking.
///
/// @returns Success if the ELF binary has been unregistered with GpaSession successfully.
Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo);
/// Validates a list of perf counters against a given PAL device.
///
/// @param [in] pDevice     The device to validate the counters for.
/// @param [in] pCounters   The list of perf counters to validate.
/// @param [in] numCounters Number of perf counters in pCounters.
///
/// @returns Success if counters are valid.
Pal::Result ValidatePerfCounters(Pal::IDevice* pDevice,
const PerfCounterId* pCounters,
const Pal::uint32 numCounters);
private:
// Tracking structure for a single IGpuMemory allocation owned by a GpaSession. In particular, it tracks the
// associated CPU pointer since these allocations remain mapped for CPU access for their lifetime.
struct GpuMemoryInfo
{
Pal::IGpuMemory* pGpuMemory; // The tracked GPU memory allocation.
void* pCpuAddr; // CPU pointer associated with pGpuMemory (the allocation stays mapped for its lifetime).
};
// Distinguishes the kinds of code object load events that can be recorded.
enum class CodeObjectLoadEventType
{
    LoadToGpuMemory     = 0,
    UnloadFromGpuMemory = 1
};
// Represents all information to be contained in one SqttCodeObjectLoaderEventRecord
struct CodeObjectLoadEventRecord
{
CodeObjectLoadEventType eventType; // Whether this record describes a load or an unload.
Pal::uint64 baseAddress; // NOTE(review): presumably the GPU address the code object was loaded at - confirm.
Pal::ShaderHash codeObjectHash; // Hash identifying the code object this event refers to.
Pal::uint64 timestamp; // NOTE(review): clock domain/units are not evident from this header - confirm.
};
// Represents all information to be contained in one SqttPsoCorrelationRecord: a pairing of an API-level PSO hash
// with an internal pipeline hash.
struct PsoCorrelationRecord
{
Pal::uint64 apiPsoHash; // Hash of the API pipeline state object.
Pal::PipelineHash internalPipelineHash; // Internal pipeline hash that apiPsoHash is correlated with.
};
// Registers a single (non-archive) pipeline with the GpaSession. Returns AlreadyExists if the PAL pipeline is a
// duplicate of one that is already registered.
Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo);
// Unregisters a single (non-archive) pipeline from the GpaSession. This is the counterpart of
// RegisterSinglePipeline().
Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline);
Pal::IDevice*const m_pDevice; // Device associated with this GpaSession.
Pal::DeviceProperties m_deviceProps; // Device properties (presumably those of m_pDevice - confirm).
Pal::SetClockModeOutput m_peakClockFrequency; // Output of query for stable peak, values in MHz
Pal::PerfExperimentProperties m_perfExperimentProps; // Perf experiment properties (presumably of m_pDevice).
Pal::uint32 m_timestampAlignment; // Pre-calculated timestamp data alignment.
ApiType m_apiType; // API type, e.g. Vulkan, used in RGP dumps.
Pal::uint16 m_apiMajorVer; // API major version, used in RGP dumps.
Pal::uint16 m_apiMinorVer; // API minor version, used in RGP dumps.
Pal::uint16 m_instrumentationSpecVersion; // Spec version of RGP instrumentation.
Pal::uint16 m_instrumentationApiVersion; // API version of RGP instrumentation.
Pal::IGpuEvent* m_pGpuEvent;
GpaSessionState m_sessionState; // Current state of the session (e.g. _reset_, _complete_, _ready_).
const GpaSession* const m_pSrcSession; // Source session for a session created via the copy constructor.
// Tracks the current GPU memory object and offset being sub-allocated for AcquireGpuMem(). One object/offset pair
// is kept per memory kind: gart, local and (local-)invisible.
GpuMemoryInfo m_curGartGpuMem;
Pal::gpusize m_curGartGpuMemOffset;
GpuMemoryInfo m_curLocalGpuMem;
Pal::gpusize m_curLocalGpuMemOffset;
GpuMemoryInfo m_curInvisGpuMem;
Pal::gpusize m_curInvisGpuMemOffset;
// Locks for the local-invisible, gart and local memory subdivision (and their pools)
Util::Mutex m_gartGpuMemLock;
Util::Mutex m_localGpuMemLock;
Util::Mutex m_invisGpuMemLock;
// Counts the number of samples that are active in this GpaSession.
Pal::uint32 m_sampleCount;
Pal::IPlatform*const m_pPlatform; // Platform associated with this GpaSession.
// GartHeap / LocalHeap / InvisHeap GPU chunk pools. Each heap has an "available" and a "busy" pool; the
// Recycle*GpuMem() helpers put used chunks back into the available pool.
Util::Deque<GpuMemoryInfo, GpaAllocator> m_availableGartGpuMem;
Util::Deque<GpuMemoryInfo, GpaAllocator> m_busyGartGpuMem;
Util::Deque<GpuMemoryInfo, GpaAllocator> m_availableLocalGpuMem;
Util::Deque<GpuMemoryInfo, GpaAllocator> m_busyLocalGpuMem;
Util::Deque<GpuMemoryInfo, GpaAllocator> m_availableInvisGpuMem;
Util::Deque<GpuMemoryInfo, GpaAllocator> m_busyInvisGpuMem;
// Forward declarations of nested sample types; their definitions are not part of this header section.
struct SampleItem;
class PerfSample;
class CounterSample;
class TraceSample;
class TimingSample;
class QuerySample;
// Sample items tracked by this session (freed/recycled by FreeSampleItemArray() / RecycleSampleItemArray()).
Util::Vector<SampleItem*, 16, GpaAllocator> m_sampleItemArray;
// NOTE(review): presumably the pool of perf experiment memory available for reuse - confirm.
PerfExpMemDeque* m_pAvailablePerfExpMem;
// Unique pipelines registered with this GpaSession.
Util::HashSet<Pal::uint64, GpaAllocator, Util::JenkinsHashFunc> m_registeredPipelines;
// Unique API PSOs registered with this GpaSession.
Util::HashSet<Pal::uint64, GpaAllocator, Util::JenkinsHashFunc> m_registeredApiHashes;
// List of cached pipeline code object records that will be copied to the final database at the end of a trace
Util::Deque<SqttCodeObjectDatabaseRecord*, GpaAllocator> m_codeObjectRecordsCache;
// List of pipeline code object records that were registered during a trace
Util::Deque<SqttCodeObjectDatabaseRecord*, GpaAllocator> m_curCodeObjectRecords;
// List of cached code object load event records that will be copied to the final database at the end of a trace
Util::Deque<CodeObjectLoadEventRecord, GpaAllocator> m_codeObjectLoadEventRecordsCache;
// List of code object load event records that were registered during a trace
Util::Deque<CodeObjectLoadEventRecord, GpaAllocator> m_curCodeObjectLoadEventRecords;
// List of cached PSO correlation records that will be copied to the final database at the end of a trace
Util::Deque<PsoCorrelationRecord, GpaAllocator> m_psoCorrelationRecordsCache;
// List of PSO correlation records that were registered during a trace
Util::Deque<PsoCorrelationRecord, GpaAllocator> m_curPsoCorrelationRecords;
// NOTE(review): presumably guards the registration sets and record lists above - confirm against the .cpp.
Util::RWLock m_registerPipelineLock;
// Kinds of events that can be recorded for a timed queue.
enum class TimedQueueEventType : Pal::uint32
{
    Submit         = 0,
    Signal         = 1,
    Wait           = 2,
    Present        = 3,
    ExternalSignal = 4,
    ExternalWait   = 5
};
// Struct that contains information about a specific timed queue event.
struct TimedQueueEventItem
{
TimedQueueEventType eventType; // Type of event
Pal::uint64 cpuTimestamp; // Time when the event was processed on the cpu
Pal::uint64 apiId; // The api specific id for the queue event
Pal::uint32 sqttCmdBufId; // The sqtt command buffer id value associated with a submit event
Pal::uint32 submitSubIndex; // The sub index of an event within a submission event.
Pal::uint32 queueIndex; // The index of the associated queue in the m_timedQueuesArray
Pal::uint64 frameIndex; // The index of the current frame being rendered
// Completion information for the event; only one of the two members is meaningful for a given event.
// NOTE(review): presumably the External* event types use cpuCompletionTimestamp and the others use
// gpuTimestamps - confirm against the implementation.
union
{
struct
{
GpuMemoryInfo memInfo[2]; // The gpu memory for the timestamps associated with the event
Pal::gpusize offsets[2]; // Memory offsets for the associated timestamp gpu memory
} gpuTimestamps;
Pal::uint64 cpuCompletionTimestamp; // The time when the event completed on the cpu
};
};
// Struct for keeping track of timed operations on a specific queue
struct TimedQueueState
{
Pal::IQueue* pQueue; // Pal Queue
Pal::uint64 queueId; // Api specific queue id
Pal::uint64 queueContext; // Api specific queue context
Pal::QueueType queueType; // Queue type
Pal::EngineType engineType; // Engine type
bool valid; // Used to track if the queue is valid
Util::Deque<Pal::ICmdBuffer*, GpaAllocator>* pAvailableCmdBuffers; // List of available cmdbuffers
Util::Deque<Pal::ICmdBuffer*, GpaAllocator>* pBusyCmdBuffers; // List of busy cmdbuffers
Pal::IFence* pFence; // Used to track queue operations
};
// Flags for the current session.
GpaSessionFlags m_flags;
// Array containing all of the queues registered for timing operations
Util::Vector<TimedQueueState*, 8, GpaAllocator> m_timedQueuesArray;
// NOTE(review): presumably guards m_timedQueuesArray - confirm.
Util::RWLock m_timedQueuesArrayLock;
// List of timed queue events for the current session
Util::Vector<TimedQueueEventItem, 16, GpaAllocator> m_queueEvents;
// NOTE(review): presumably guards m_queueEvents - confirm.
Util::Mutex m_queueEventsLock;
// List of timestamp calibration samples
Util::Vector<Pal::CalibratedTimestamps, 4, GpaAllocator> m_timestampCalibrations;
// The most recent gpu clocks sample
GpuClocksSample m_lastGpuClocksSample;
// Internal command allocator used for timing command buffers
Pal::ICmdAllocator* m_pCmdAllocator;
// Finds the TimedQueueState associated with pQueue.
// NOTE(review): ppQueueState and pQueueIndex look like outputs (the matching state and, presumably, its index in
// m_timedQueuesArray) - confirm against the implementation.
Pal::Result FindTimedQueue(Pal::IQueue* pQueue,
TimedQueueState** ppQueueState,
Pal::uint32* pQueueIndex);
// Finds the TimedQueueState associated with queueContext. Same shape as FindTimedQueue(), but the lookup key is
// the API-specific queue context rather than the PAL queue.
Pal::Result FindTimedQueueByContext(Pal::uint64 queueContext,
TimedQueueState** ppQueueState,
Pal::uint32* pQueueIndex);
/// Injects an external timed queue semaphore operation event. The queue is identified by queueContext, and
/// isSignalOperation selects between a signal and (otherwise) a wait operation.
Pal::Result ExternalTimedQueueSemaphoreOperation(Pal::uint64 queueContext,
Pal::uint64 cpuSubmissionTimestamp,
Pal::uint64 cpuCompletionTimestamp,
const TimedQueueSemaphoreInfo& timedSemaphoreInfo,
bool isSignalOperation);
/// Converts a CPU timestamp to a GPU timestamp using a CalibratedTimestamps struct. Returns the converted GPU
/// timestamp.
Pal::uint64 ConvertCpuTimestampToGpuTimestamp(Pal::uint64 cpuTimestamp,
const Pal::CalibratedTimestamps& calibration) const;
/// Extracts a GPU timestamp from a queue event. Returns the extracted GPU timestamp.
Pal::uint64 ExtractGpuTimestampFromQueueEvent(const TimedQueueEventItem& queueEvent) const;
// Creates a new command buffer for use on pQueue. The new command buffer is handed back through ppCmdBuffer.
Pal::Result CreateCmdBufferForQueue(Pal::IQueue* pQueue,
Pal::ICmdBuffer** ppCmdBuffer);
// Acquires a command buffer from the TimedQueueState's command buffer pool. The acquired command buffer is handed
// back through ppCmdBuffer.
Pal::Result AcquireTimedQueueCmdBuffer(TimedQueueState* pQueueState,
Pal::ICmdBuffer** ppCmdBuffer);
// Recycles busy command buffers in pQueueState
Pal::Result RecycleTimedQueueCmdBuffers(TimedQueueState* pQueueState);
// Preallocates a fixed number (numCmdBuffers) of command buffers for pQueueState and adds them to the command
// buffer pool
Pal::Result PreallocateTimedQueueCmdBuffers(TimedQueueState* pQueueState,
Pal::uint32 numCmdBuffers);
// Resets all per session state in pQueueState
Pal::Result ResetTimedQueueState(TimedQueueState* pQueueState);
// Destroys the memory and resources for pQueueState
void DestroyTimedQueueState(TimedQueueState* pQueueState);
// Helper function to import one sample item (pSrcSampleItem) from a source session into a copy session.
Pal::Result ImportSampleItem(const SampleItem* pSrcSampleItem);
// Acquires a range of queue-owned GPU memory for use by the next command buffer submission.
// NOTE(review): "queue-owned" looks inconsistent with the session members that track sub-allocation for this
// function (m_cur*GpuMem / m_cur*GpuMemOffset) - confirm the intended ownership wording.
// The request is described by size, alignment, heapType and mallPolicy; pGpuMem and pOffset presumably receive
// the memory object and the offset of the acquired range within it - confirm.
Pal::Result AcquireGpuMem(
Pal::gpusize size,
Pal::gpusize alignment,
Pal::GpuHeap heapType,
Pal::GpuMemMallPolicy mallPolicy,
GpuMemoryInfo* pGpuMem,
Pal::gpusize* pOffset);
// Acquires a GpaSession-owned performance experiment based on the device's active perf counter requests.
// NOTE(review): pGpuMem/pOffset, pSecondaryGpuMem/pSecondaryOffset, pHeapSize and ppExperiment are non-const
// pointers and look like outputs - confirm against the implementation.
Pal::Result AcquirePerfExperiment(
GpaSession::SampleItem* pSampleItem,
const GpaSampleConfig& sampleConfig,
GpuMemoryInfo* pGpuMem,
Pal::gpusize* pOffset,
GpuMemoryInfo* pSecondaryGpuMem,
Pal::gpusize* pSecondaryOffset,
Pal::gpusize* pHeapSize,
Pal::IPerfExperiment** ppExperiment);
// Acquires a session-owned pipeline stats query.
// NOTE(review): pGpuMem, pOffset, pHeapSize and ppQuery are non-const pointers and look like outputs - confirm
// against the implementation.
Pal::Result AcquirePipeStatsQuery(
GpuMemoryInfo* pGpuMem,
Pal::gpusize* pOffset,
Pal::gpusize* pHeapSize,
Pal::IQueryPool** ppQuery);
// Dumps SQ thread trace data in RGP format.
// NOTE(review): pRgpOutput/pTraceSize presumably follow the same "null buffer queries the required size"
// convention as GetResults() - confirm against the implementation.
Pal::Result DumpRgpData(const GpaSampleConfig* pTraceConfig,
TraceSample* pTraceSample,
void* pRgpOutput,
size_t* pTraceSize) const;
// Dumps the SPM trace data of pTraceSample into the buffer provided (pData, of bufferSize bytes).
// NOTE(review): pSizeInBytes presumably reports the number of bytes written/required - confirm.
Pal::Result AppendSpmTraceData(TraceSample* pTraceSample,
size_t bufferSize,
void* pData,
Pal::gpusize* pSizeInBytes) const;
// Dumps the DF SPM trace data of pTraceSample into the buffer provided (pData, of bufferSize bytes).
// NOTE(review): pSizeInBytes presumably reports the number of bytes written/required - confirm.
Pal::Result AppendDfSpmTraceData(TraceSample* pTraceSample,
size_t bufferSize,
void* pData,
Pal::gpusize* pSizeInBytes) const;
// Adds a code object load event of the given type (load or unload) for a pipeline, a shader library or an ELF
// binary, respectively.
// NOTE(review): presumably appends a CodeObjectLoadEventRecord to m_curCodeObjectLoadEventRecords - confirm.
Pal::Result AddCodeObjectLoadEvent(const Pal::IPipeline* pPipeline, CodeObjectLoadEventType eventType);
Pal::Result AddCodeObjectLoadEvent(const Pal::IShaderLibrary* pLibrary, CodeObjectLoadEventType eventType);
Pal::Result AddCodeObjectLoadEvent(const ElfBinaryInfo& elfBinaryInfo, CodeObjectLoadEventType eventType);
// Recycles used Gart rafts, putting them back into the available pool.
void RecycleGartGpuMem();
// Recycles used Local rafts, putting them back into the available pool.
void RecycleLocalGpuMem();
// Recycles used Invisible rafts, putting them back into the available pool.
void RecycleInvisGpuMem();
// Destroys and frees one sample item and its sub-items.
void FreeSampleItem(GpaSession::SampleItem* pSampleItem);
// Destroys and frees m_sampleItemArray and its associated memory allocations.
void FreeSampleItemArray();
// Destroys the sub-items in m_sampleItemArray but keeps the associated memory allocations.
void RecycleSampleItemArray();
// Helper function to destroy the GpuMemoryInfo object
void DestroyGpuMemoryInfo(GpuMemoryInfo* pGpuMemoryInfo);
// GpaSession cannot be default-constructed.
PAL_DISALLOW_DEFAULT_CTOR(GpaSession);
// Copy-assignment is not supported (copies of a session are made through the copy constructor only). The operator
// is explicitly deleted so that any attempted use, including from members, fails at compile time rather than at
// link time.
GpaSession& operator =(const GpaSession&) = delete;
};
} // GpuUtil