eecbc2e436
It seems that due to removal of vdev()->isHandlerPending(), Marker queued to ensure finish is not enqueued and that cause hung at waiting event for kernel enqueue command. Change-Id: I364abb2dcb4897b11a7eb61b5d85013b69292792
2067 строки
71 KiB
C++
2067 строки
71 KiB
C++
/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE. */
|
|
|
|
#ifndef DEVICE_HPP_
|
|
#define DEVICE_HPP_
|
|
|
|
#include "top.hpp"
|
|
#include "thread/thread.hpp"
|
|
#include "thread/monitor.hpp"
|
|
#include "platform/context.hpp"
|
|
#include "platform/object.hpp"
|
|
#include "platform/memory.hpp"
|
|
#include "utils/util.hpp"
|
|
#include "amdocl/cl_kernel.h"
|
|
#include "elf/elf.hpp"
|
|
#include "appprofile.hpp"
|
|
#include "devprogram.hpp"
|
|
#include "devkernel.hpp"
|
|
#include "amdocl/cl_profile_amd.h"
|
|
#if defined(WITH_COMPILER_LIB)
|
|
#include "hsailctx.hpp"
|
|
#endif
|
|
#include "devsignal.hpp"
|
|
|
|
#if defined(__clang__)
|
|
#if __has_feature(address_sanitizer)
|
|
#include "devurilocator.hpp"
|
|
#endif
|
|
#endif
|
|
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <map>
|
|
#include <mutex>
|
|
#include <list>
|
|
#include <set>
|
|
#include <unordered_set>
|
|
#include <utility>
|
|
|
|
namespace amd {
|
|
class Command;
|
|
class CommandQueue;
|
|
class ReadMemoryCommand;
|
|
class WriteMemoryCommand;
|
|
class FillMemoryCommand;
|
|
class CopyMemoryCommand;
|
|
class CopyMemoryP2PCommand;
|
|
class MapMemoryCommand;
|
|
class UnmapMemoryCommand;
|
|
class MigrateMemObjectsCommand;
|
|
class NDRangeKernelCommand;
|
|
class NativeFnCommand;
|
|
class FlushCommand;
|
|
class FinishCommand;
|
|
class AcquireExtObjectsCommand;
|
|
class ReleaseExtObjectsCommand;
|
|
class PerfCounterCommand;
|
|
class ReleaseObjectCommand;
|
|
class StallQueueCommand;
|
|
class Marker;
|
|
class AccumulateCommand;
|
|
class ThreadTraceCommand;
|
|
class ThreadTraceMemObjectsCommand;
|
|
class SignalCommand;
|
|
class MakeBuffersResidentCommand;
|
|
class SvmFreeMemoryCommand;
|
|
class SvmCopyMemoryCommand;
|
|
class SvmFillMemoryCommand;
|
|
class SvmMapMemoryCommand;
|
|
class SvmUnmapMemoryCommand;
|
|
class SvmPrefetchAsyncCommand;
|
|
class StreamOperationCommand;
|
|
class VirtualMapCommand;
|
|
class ExternalSemaphoreCmd;
|
|
class Isa;
|
|
class Device;
|
|
struct KernelParameterDescriptor;
|
|
struct Coord3D;
|
|
|
|
//! @note: the defines match hip values
|
|
enum MemoryAdvice : uint32_t {
|
|
SetReadMostly = 1, ///< Data will mostly be read and only occassionally be written to
|
|
UnsetReadMostly = 2, ///< Undo the effect of hipMemAdviseSetReadMostly
|
|
SetPreferredLocation = 3, ///< Set the preferred location for the data as the specified device
|
|
UnsetPreferredLocation = 4, ///< Clear the preferred location for the data
|
|
SetAccessedBy = 5, ///< Data will be accessed by the specified device, reducing
|
|
///< the amount of page faults
|
|
UnsetAccessedBy = 6, ///< HMM decides on the page faulting policy for the specified device
|
|
SetCoarseGrain = 100, ///< Change cache policy to improve performance (disables coherency)
|
|
UnsetCoarseGrain = 101 ///< Restore coherent cache policy at the cost of some performance
|
|
};
|
|
|
|
enum MemRangeAttribute : uint32_t {
|
|
ReadMostly = 1, ///< Whether the range will mostly be read and only
|
|
///< occassionally be written to
|
|
PreferredLocation = 2, ///< The preferred location of the range
|
|
AccessedBy = 3, ///< Memory range has hipMemAdviseSetAccessedBy
|
|
///< set for specified device
|
|
LastPrefetchLocation = 4, ///< The last location to which the range was prefetched
|
|
CoherencyMode = 100, ///< Current coherency mode for the specified range
|
|
};
|
|
|
|
constexpr int CpuDeviceId = static_cast<int>(-1);
|
|
constexpr int InvalidDeviceId = static_cast<int>(-2);
|
|
|
|
enum class ExternalSemaphoreHandleType : uint32_t {
|
|
OpaqueFd = 1, // Handle is an opaque file descriptor
|
|
OpaqueWin32 = 2, // Handle is an opaque shared NT handle
|
|
OpaqueWin32Kmt = 3, // Handle is an opaque, globally shared handle
|
|
D3D12Fence = 4, // Handle is a shared NT handle referencing a
|
|
// D3D12 fence object
|
|
D3D11Fence = 5, // Handle is a shared NT handle referencing a
|
|
// D3D11 fence object
|
|
NvSciSync = 6, // Opaque handle to NvSciSync Object
|
|
KeyedMutex = 7, // Handle is a shared NT handle referencing a
|
|
// D3D11 keyed mutex object
|
|
KeyedMutexKmt = 8, // Handle is a shared KMT handle referencing a
|
|
// D3D11 keyed mutex object
|
|
TimelineSemaphoreFd = 9, // Handle is an opaque handle file
|
|
// descriptor referencing a timeline
|
|
// semaphore
|
|
TimelineSemaphoreWin32 = 10, // Handle is an opaque handle file
|
|
// descriptor referencing a timeline
|
|
// semaphore
|
|
};
|
|
} // namespace amd
|
|
|
|
enum OclExtensions {
|
|
ClKhrFp64 = 0,
|
|
ClAmdFp64,
|
|
ClKhrSelectFpRoundingMode,
|
|
ClKhrGlobalInt32BaseAtomics,
|
|
ClKhrGlobalInt32ExtendedAtomics,
|
|
ClKhrLocalInt32BaseAtomics,
|
|
ClKhrLocalInt32ExtendedAtomics,
|
|
ClKhrInt64BaseAtomics,
|
|
ClKhrInt64ExtendedAtomics,
|
|
ClKhr3DImageWrites,
|
|
ClKhrByteAddressableStore,
|
|
ClKhrFp16,
|
|
ClKhrGlSharing,
|
|
ClKhrGLDepthImages,
|
|
ClExtDeviceFission,
|
|
ClAmdDeviceAttributeQuery,
|
|
ClAmdVec3,
|
|
ClAmdPrintf,
|
|
ClAmdMediaOps,
|
|
ClAmdMediaOps2,
|
|
ClAmdPopcnt,
|
|
#if defined(_WIN32)
|
|
ClKhrD3d10Sharing,
|
|
ClKhrD3d11Sharing,
|
|
ClKhrD3d9Sharing,
|
|
#endif
|
|
ClKhrImage2dFromBuffer,
|
|
ClAMDBusAddressableMemory,
|
|
ClAMDC11Atomics,
|
|
ClKhrSpir,
|
|
ClKhrSubGroups,
|
|
ClKhrGlEvent,
|
|
ClKhrDepthImages,
|
|
ClKhrMipMapImage,
|
|
ClKhrMipMapImageWrites,
|
|
ClAmdCopyBufferP2P,
|
|
ClAmdAssemblyProgram,
|
|
#if defined(_WIN32)
|
|
ClAmdPlanarYuv,
|
|
#endif
|
|
ClExtTotal
|
|
};
|
|
|
|
static constexpr const char* OclExtensionsString[] = {"cl_khr_fp64 ",
|
|
"cl_amd_fp64 ",
|
|
"cl_khr_select_fprounding_mode ",
|
|
"cl_khr_global_int32_base_atomics ",
|
|
"cl_khr_global_int32_extended_atomics ",
|
|
"cl_khr_local_int32_base_atomics ",
|
|
"cl_khr_local_int32_extended_atomics ",
|
|
"cl_khr_int64_base_atomics ",
|
|
"cl_khr_int64_extended_atomics ",
|
|
"cl_khr_3d_image_writes ",
|
|
"cl_khr_byte_addressable_store ",
|
|
"cl_khr_fp16 ",
|
|
"cl_khr_gl_sharing ",
|
|
"cl_khr_gl_depth_images ",
|
|
"cl_ext_device_fission ",
|
|
"cl_amd_device_attribute_query ",
|
|
"cl_amd_vec3 ",
|
|
"cl_amd_printf ",
|
|
"cl_amd_media_ops ",
|
|
"cl_amd_media_ops2 ",
|
|
"cl_amd_popcnt ",
|
|
#if defined(_WIN32)
|
|
"cl_khr_d3d10_sharing ",
|
|
"cl_khr_d3d11_sharing ",
|
|
"cl_khr_dx9_media_sharing ",
|
|
#endif
|
|
"cl_khr_image2d_from_buffer ",
|
|
"cl_amd_bus_addressable_memory ",
|
|
"cl_amd_c11_atomics ",
|
|
"cl_khr_spir ",
|
|
"cl_khr_subgroups ",
|
|
"cl_khr_gl_event ",
|
|
"cl_khr_depth_images ",
|
|
"cl_khr_mipmap_image ",
|
|
"cl_khr_mipmap_image_writes ",
|
|
"cl_amd_copy_buffer_p2p ",
|
|
"cl_amd_assembly_program ",
|
|
#if defined(_WIN32)
|
|
"cl_amd_planar_yuv",
|
|
#endif
|
|
NULL};
|
|
|
|
static constexpr int AmdVendor = 0x1002;
|
|
|
|
template <typename T>
|
|
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
|
T src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
size_t offset //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
assert(sizeof(T) <= size && "Argument's size mismatches ABI!");
|
|
*(reinterpret_cast<T*>(dst + offset)) = src;
|
|
}
|
|
|
|
namespace device {
|
|
class ClBinary;
|
|
class BlitManager;
|
|
class Program;
|
|
class Kernel;
|
|
|
|
//! Physical device properties.
|
|
struct Info : public amd::EmbeddedObject {
|
|
//! The OpenCL device type.
|
|
cl_device_type type_;
|
|
|
|
//! A unique device vendor identifier.
|
|
uint32_t vendorId_;
|
|
|
|
//! The available number of parallel compute cores on the compute device.
|
|
uint32_t maxComputeUnits_;
|
|
|
|
//! The max number of parallel compute cores on the compute device.
|
|
uint32_t maxPhysicalComputeUnits_;
|
|
|
|
//! Maximum dimensions that specify the global and local work-item IDs
|
|
// used by the data-parallel execution model.
|
|
uint32_t maxWorkItemDimensions_;
|
|
|
|
//! Maximum number of work-items that can be specified in each dimension
|
|
// to clEnqueueNDRangeKernel.
|
|
size_t maxWorkItemSizes_[3];
|
|
|
|
//! Maximum number of work-items in a work-group executing a kernel
|
|
// using the data-parallel execution model.
|
|
size_t maxWorkGroupSize_;
|
|
|
|
//! Preferred number of work-items in a work-group executing a kernel
|
|
// using the data-parallel execution model.
|
|
size_t preferredWorkGroupSize_;
|
|
|
|
//! Number of shader engines in physical GPU
|
|
size_t numberOfShaderEngines;
|
|
|
|
//! uint32_t Preferred native vector width size for built-in scalar types
|
|
// that can be put into vectors.
|
|
uint32_t preferredVectorWidthChar_;
|
|
uint32_t preferredVectorWidthShort_;
|
|
uint32_t preferredVectorWidthInt_;
|
|
uint32_t preferredVectorWidthLong_;
|
|
uint32_t preferredVectorWidthFloat_;
|
|
uint32_t preferredVectorWidthDouble_;
|
|
uint32_t preferredVectorWidthHalf_;
|
|
|
|
//! Returns the native ISA vector width. The vector width is defined as the
|
|
// number of scalar elements that can be stored in the vector.
|
|
uint32_t nativeVectorWidthChar_;
|
|
uint32_t nativeVectorWidthShort_;
|
|
uint32_t nativeVectorWidthInt_;
|
|
uint32_t nativeVectorWidthLong_;
|
|
uint32_t nativeVectorWidthFloat_;
|
|
uint32_t nativeVectorWidthDouble_;
|
|
uint32_t nativeVectorWidthHalf_;
|
|
|
|
//! Maximum configured engine clock frequency of the device in MHz.
|
|
uint32_t maxEngineClockFrequency_;
|
|
|
|
//! Maximum configured memory clock frequency of the device in MHz.
|
|
uint32_t maxMemoryClockFrequency_;
|
|
|
|
//! The constant frequency of wall clock in KHz
|
|
uint32_t wallClockFrequency_;
|
|
|
|
//! Memory bus width in bits.
|
|
uint32_t vramBusBitWidth_;
|
|
|
|
//! Size of L2 Cache in bytes.
|
|
uint32_t l2CacheSize_;
|
|
|
|
//! Timestamp frequency in Hz.
|
|
uint32_t timeStampFrequency_;
|
|
|
|
//! Describes the address spaces supported by the device.
|
|
uint32_t addressBits_;
|
|
|
|
//! Max number of simultaneous image objects that can be read by a
|
|
// kernel.
|
|
uint32_t maxReadImageArgs_;
|
|
|
|
//! Max number of simultaneous image objects that can be written to
|
|
// by a kernel.
|
|
uint32_t maxWriteImageArgs_;
|
|
|
|
//! Max number of simultaneous image objects that can be read/written to
|
|
// by a kernel.
|
|
uint32_t maxReadWriteImageArgs_;
|
|
|
|
//! Max size of memory object allocation in bytes.
|
|
uint64_t maxMemAllocSize_;
|
|
|
|
//! Max size of system memory allocation in bytes.
|
|
size_t maxPhysicalMemAllocSize_;
|
|
|
|
//! Max width of 2D image in pixels.
|
|
size_t image2DMaxWidth_;
|
|
|
|
//! Max width of 2DA image in pixels.
|
|
size_t image2DAMaxWidth_[2];
|
|
|
|
//! Max height of 2D image in pixels.
|
|
size_t image2DMaxHeight_;
|
|
|
|
//! Max width of 3D image in pixels.
|
|
size_t image3DMaxWidth_;
|
|
|
|
//! Max height of 3D image in pixels.
|
|
size_t image3DMaxHeight_;
|
|
|
|
//! Max depth of 3D image in pixels.
|
|
size_t image3DMaxDepth_;
|
|
|
|
//! Describes whether images are supported
|
|
uint32_t imageSupport_;
|
|
|
|
//! Max size in bytes of the arguments that can be passed to a kernel.
|
|
size_t maxParameterSize_;
|
|
|
|
//! Maximum number of samplers that can be used in a kernel.
|
|
uint32_t maxSamplers_;
|
|
|
|
//! Describes the alignment in bits of the base address of any
|
|
// allocated memory object.
|
|
uint32_t memBaseAddrAlign_;
|
|
|
|
//! The smallest alignment in bytes which can be used for any data type.
|
|
uint32_t minDataTypeAlignSize_;
|
|
|
|
//! Describes single precision floating point capability of the device.
|
|
cl_device_fp_config halfFPConfig_;
|
|
cl_device_fp_config singleFPConfig_;
|
|
cl_device_fp_config doubleFPConfig_;
|
|
|
|
//! Type of global memory cache supported.
|
|
cl_device_mem_cache_type globalMemCacheType_;
|
|
|
|
//! Size of global memory cache line in bytes.
|
|
uint32_t globalMemCacheLineSize_;
|
|
|
|
//! Size of global memory cache in bytes.
|
|
uint64_t globalMemCacheSize_;
|
|
|
|
//! Size of global device memory in bytes.
|
|
uint64_t globalMemSize_;
|
|
|
|
//! Max size in bytes of a constant buffer allocation.
|
|
uint64_t maxConstantBufferSize_;
|
|
|
|
//! Preferred size in bytes of a constant buffer allocation.
|
|
uint64_t preferredConstantBufferSize_;
|
|
|
|
//! Max number of arguments declared
|
|
uint32_t maxConstantArgs_;
|
|
|
|
//! This is used to determine the type of local memory that is available
|
|
cl_device_local_mem_type localMemType_;
|
|
|
|
//! Size of local memory arena in bytes.
|
|
uint64_t localMemSize_;
|
|
|
|
//! If enabled, implies that all the memories, caches, registers etc. in
|
|
// the device implement error correction.
|
|
uint32_t errorCorrectionSupport_;
|
|
|
|
//! true if the device and the host have a unified memory subsystem and
|
|
// is false otherwise.
|
|
uint32_t hostUnifiedMemory_;
|
|
|
|
//! Describes the resolution of device timer.
|
|
size_t profilingTimerResolution_;
|
|
|
|
//! Timer starting point offset to Epoch.
|
|
uint64_t profilingTimerOffset_;
|
|
|
|
//! true if device is a little endian device.
|
|
uint32_t littleEndian_;
|
|
|
|
//! If enabled, implies that commands can be submitted to command-queues
|
|
// created on this device.
|
|
uint32_t available_;
|
|
|
|
//! if the implementation does not have a compiler available to compile
|
|
// the program source.
|
|
uint32_t compilerAvailable_;
|
|
|
|
//! Describes the execution capabilities of the device.
|
|
cl_device_exec_capabilities executionCapabilities_;
|
|
|
|
//! Describes the SVM capabilities of the device.
|
|
cl_device_svm_capabilities svmCapabilities_;
|
|
|
|
//! Preferred alignment for OpenCL fine-grained SVM atomic types.
|
|
uint32_t preferredPlatformAtomicAlignment_;
|
|
|
|
//! Preferred alignment for OpenCL global atomic types.
|
|
uint32_t preferredGlobalAtomicAlignment_;
|
|
|
|
//! Preferred alignment for OpenCL local atomic types.
|
|
uint32_t preferredLocalAtomicAlignment_;
|
|
|
|
//! Describes the command-queue properties supported of the host queue.
|
|
cl_command_queue_properties queueProperties_;
|
|
|
|
//! The platform associated with this device
|
|
cl_platform_id platform_;
|
|
|
|
//! Device name string
|
|
char name_[0x40];
|
|
|
|
//! Vendor name string
|
|
char vendor_[0x20];
|
|
|
|
//! OpenCL software driver version string in the form major.minor
|
|
char driverVersion_[0x20];
|
|
|
|
//! Returns the profile name supported by the device.
|
|
const char* profile_;
|
|
|
|
//! Returns the OpenCL version supported by the device.
|
|
const char* version_;
|
|
|
|
//! The highest OpenCL C version supported by the compiler for this device.
|
|
const char* oclcVersion_;
|
|
|
|
//! Returns a space separated list of extension names.
|
|
const char* extensions_;
|
|
|
|
//! Returns if device linker is available
|
|
uint32_t linkerAvailable_;
|
|
|
|
//! Returns the list of built-in kernels, supported by the device
|
|
const char* builtInKernels_;
|
|
|
|
//! Returns max number of pixels for a 1D image
|
|
size_t image1DMaxWidth_;
|
|
|
|
//! Returns max number of pixels for a 1DA image
|
|
size_t image1DAMaxWidth_;
|
|
|
|
//! Returns max number of pixels for a 1D image created from a buffer object
|
|
size_t imageMaxBufferSize_;
|
|
|
|
//! Returns max number of images in a 1D or 2D image array
|
|
size_t imageMaxArraySize_;
|
|
|
|
//! Returns true if the devices preference is for the user to be
|
|
//! responsible for synchronization
|
|
uint32_t preferredInteropUserSync_;
|
|
|
|
//! Returns maximum size of the internal buffer that holds the output
|
|
//! of printf calls from a kernel
|
|
size_t printfBufferSize_;
|
|
|
|
//! Indicates maximum number of supported global atomic counters
|
|
uint32_t maxAtomicCounters_;
|
|
|
|
//! Returns the topology for the device
|
|
cl_device_topology_amd deviceTopology_;
|
|
|
|
//! Returns PCI Bus Domain ID
|
|
uint32_t pciDomainID;
|
|
|
|
//! Returns sddress of HDP_MEM_COHERENCY_FLUSH_CNTL register
|
|
uint32_t* hdpMemFlushCntl;
|
|
|
|
//! Returns sddress of HDP_REG_COHERENCY_FLUSH_CNTL register
|
|
uint32_t* hdpRegFlushCntl;
|
|
|
|
//! Semaphore information
|
|
uint32_t maxSemaphores_;
|
|
uint32_t maxSemaphoreSize_;
|
|
|
|
//! Returns the SKU board name for the device
|
|
char boardName_[128];
|
|
|
|
//! Number of SIMD (Single Instruction Multiple Data) units per compute unit
|
|
//! that execute in parallel. All work items from the same work group must be
|
|
//! executed by SIMDs in the same compute unit.
|
|
uint32_t simdPerCU_;
|
|
uint32_t cuPerShaderArray_; //!< Number of CUs per shader array
|
|
//! The maximum number of work items from the same work group that can be
|
|
//! executed by a SIMD in parallel
|
|
uint32_t simdWidth_;
|
|
//! The number of instructions that a SIMD can execute in parallel
|
|
uint32_t simdInstructionWidth_;
|
|
//! The number of workitems per wavefront
|
|
uint32_t wavefrontWidth_;
|
|
//! Available number of SGPRs
|
|
uint32_t availableSGPRs_;
|
|
//! Available number of VGPRs
|
|
uint32_t availableVGPRs_;
|
|
//! Available number of registers per CU
|
|
uint32_t availableRegistersPerCU_;
|
|
//! Number of global memory channels
|
|
uint32_t globalMemChannels_;
|
|
//! Number of banks in each global memory channel
|
|
uint32_t globalMemChannelBanks_;
|
|
//! Width in bytes of each of global memory bank
|
|
uint32_t globalMemChannelBankWidth_;
|
|
//! Local memory size per CU
|
|
uint32_t localMemSizePerCU_;
|
|
//! Number of banks of local memory
|
|
uint32_t localMemBanks_;
|
|
//! Number of available async queues
|
|
uint32_t numAsyncQueues_;
|
|
//! Number of available real time queues
|
|
uint32_t numRTQueues_;
|
|
//! Number of available real time compute units
|
|
uint32_t numRTCUs_;
|
|
//! The granularity at which compute units can be dedicated to a queue
|
|
uint32_t granularityRTCUs_;
|
|
//! Thread trace enable
|
|
uint32_t threadTraceEnable_;
|
|
|
|
//! Image pitch alignment for image2d_from_buffer
|
|
uint32_t imagePitchAlignment_;
|
|
//! Image base address alignment for image2d_from_buffer
|
|
uint32_t imageBaseAddressAlignment_;
|
|
|
|
//! Describes whether buffers from images are supported
|
|
uint32_t bufferFromImageSupport_;
|
|
|
|
//! Returns the supported SPIR versions for the device
|
|
const char* spirVersions_;
|
|
|
|
//! OpenCL20 device info fields:
|
|
|
|
//! The max number of pipe objects that can be passed as arguments to a kernel
|
|
uint32_t maxPipeArgs_;
|
|
//! The max number of reservations that can be active for a pipe per work-item in a kernel
|
|
uint32_t maxPipeActiveReservations_;
|
|
//! The max size of pipe packet in bytes
|
|
uint32_t maxPipePacketSize_;
|
|
|
|
//! The command-queue properties supported of the device queue.
|
|
cl_command_queue_properties queueOnDeviceProperties_;
|
|
//! The preferred size of the device queue in bytes
|
|
uint32_t queueOnDevicePreferredSize_;
|
|
//! The max size of the device queue in bytes
|
|
uint32_t queueOnDeviceMaxSize_;
|
|
//! The maximum number of device queues
|
|
uint32_t maxOnDeviceQueues_;
|
|
//! The maximum number of events in use on a device queue
|
|
uint32_t maxOnDeviceEvents_;
|
|
|
|
//! The maximum size of global scope variables
|
|
size_t maxGlobalVariableSize_;
|
|
size_t globalVariablePreferredTotalSize_;
|
|
//! Driver store location
|
|
char driverStore_[200];
|
|
//! Device ID
|
|
uint32_t pcieDeviceId_;
|
|
//! PCI Revision ID
|
|
uint32_t pcieRevisionId_;
|
|
//! ASIC Revision
|
|
uint32_t asicRevision_;
|
|
//! Returns the unique identifier for the device
|
|
char uuid_[16];
|
|
//! Max numbers of threads per CU
|
|
uint32_t maxThreadsPerCU_;
|
|
|
|
//! GPU device supports a launch of cooperative groups
|
|
uint32_t cooperativeGroups_;
|
|
//! GPU device supports a launch of cooperative groups on multiple devices
|
|
uint32_t cooperativeMultiDeviceGroups_;
|
|
|
|
//! large bar support.
|
|
bool largeBar_;
|
|
|
|
uint32_t hmmSupported_; //!< ROCr supports HMM interfaces
|
|
uint32_t hmmCpuMemoryAccessible_; //!< CPU memory is accessible by GPU without pinning/register
|
|
uint32_t hmmDirectHostAccess_; //!< HMM memory is accessible from the host without migration
|
|
|
|
//! global CU mask which will be applied to all queues created on this device
|
|
std::vector<uint32_t> globalCUMask_;
|
|
|
|
//! AQL Barrier Value Packet support
|
|
bool aqlBarrierValue_;
|
|
|
|
bool pcie_atomics_; //!< Pcie atomics support flag
|
|
|
|
bool virtualMemoryManagement_; //!< Virtual memory management support
|
|
size_t virtualMemAllocGranularity_; //!< virtual memory allocation size/addr granularity
|
|
|
|
uint32_t driverNodeId_;
|
|
//! Number of Physical SGPRs per SIMD
|
|
uint32_t sgprsPerSimd_;
|
|
//! Number of VGPRs per SIMD
|
|
uint32_t vgprsPerSimd_;
|
|
uint32_t vgprAllocGranularity_;
|
|
uint32_t numSDMAengines_; //!< Number of available SDMA engines
|
|
|
|
uint32_t luidLowPart_; //!< Luid low 4 bytes, available in Windows only
|
|
uint32_t luidHighPart_; //!< Luid high 4 bytes, available in Windows only
|
|
uint32_t luidDeviceNodeMask_; //!< Luid node mask
|
|
};
|
|
|
|
//! Device settings
|
|
class Settings : public amd::HeapObject {
|
|
public:
|
|
uint64_t extensions_; //!< Supported OCL extensions
|
|
union {
|
|
struct {
|
|
uint apuSystem_ : 1; //!< Device is APU system with shared memory
|
|
uint supportRA_ : 1; //!< Support RA channel order format
|
|
uint waitCommand_ : 1; //!< Enables a wait for every submitted command
|
|
uint customHostAllocator_ : 1; //!< True if device has custom host allocator
|
|
// that replaces generic OS allocation routines
|
|
uint supportDepthsRGB_ : 1; //!< Support DEPTH and sRGB channel order format
|
|
uint reportFMAF_ : 1; //!< Report FP_FAST_FMAF define in CL program
|
|
uint reportFMA_ : 1; //!< Report FP_FAST_FMA define in CL program
|
|
uint singleFpDenorm_ : 1; //!< Support Single FP Denorm
|
|
uint hsailExplicitXnack_ : 1; //!< Xnack in hsail path for this device
|
|
uint useLightning_ : 1; //!< Enable LC path for this device
|
|
uint enableWgpMode_ : 1; //!< Enable WGP mode for this device
|
|
uint enableWave32Mode_ : 1; //!< Enable Wave32 mode for this device
|
|
uint lcWavefrontSize64_ : 1; //!< Enable Wave64 mode for this device
|
|
uint enableXNACK_ : 1; //!< Enable XNACK feature
|
|
uint enableCoopGroups_ : 1; //!< Enable cooperative groups feature
|
|
uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device
|
|
uint fenceScopeAgent_ : 1; //!< Enable fence scope agent in AQL dispatch packet
|
|
uint rocr_backend_ : 1; //!< Device uses ROCr backend for submissions
|
|
uint gwsInitSupported_:1; //!< Check if GWS is supported on this machine.
|
|
uint reserved_ : 10;
|
|
};
|
|
uint value_;
|
|
};
|
|
|
|
//! Default constructor
|
|
Settings();
|
|
|
|
//! Virtual destructor as this class is used as a base class and is also used
|
|
//! to delete the derived classes.
|
|
virtual ~Settings() {};
|
|
|
|
//! Check the specified extension
|
|
bool checkExtension(uint name) const {
|
|
return (extensions_ & (static_cast<uint64_t>(1) << name)) ? true : false;
|
|
}
|
|
|
|
//! Enable the specified extension
|
|
void enableExtension(uint name) { extensions_ |= static_cast<uint64_t>(1) << name; }
|
|
|
|
private:
|
|
//! Disable copy constructor
|
|
Settings(const Settings&);
|
|
|
|
//! Disable assignment
|
|
Settings& operator=(const Settings&);
|
|
};
|
|
|
|
//! Device-independent cache memory, base class for the device-specific
|
|
//! memories. One Memory instance refers to one or more of these.
|
|
class Memory : public amd::HeapObject {
|
|
public:
|
|
//! Resource map flags
|
|
enum CpuMapFlags {
|
|
CpuReadWrite = 0x00000000, //!< Lock for CPU read/Write
|
|
CpuReadOnly = 0x00000001, //!< Lock for CPU read only operation
|
|
CpuWriteOnly = 0x00000002, //!< Lock for CPU write only operation
|
|
};
|
|
|
|
union SyncFlags {
|
|
struct {
|
|
uint skipParent_ : 1; //!< Skip parent synchronization
|
|
uint skipViews_ : 1; //!< Skip views synchronization
|
|
uint skipEntire_ : 1; //!< Skip entire synchronization
|
|
};
|
|
uint value_;
|
|
SyncFlags() : value_(0) {}
|
|
};
|
|
|
|
struct WriteMapInfo : public amd::HeapObject {
|
|
amd::Coord3D origin_; //!< Origin of the map location
|
|
amd::Coord3D region_; //!< Mapped region
|
|
amd::Image* baseMip_; //!< The base mip level for images
|
|
union {
|
|
struct {
|
|
uint32_t count_ : 8; //!< The same map region counter
|
|
uint32_t unmapWrite_ : 1; //!< Unmap write operation
|
|
uint32_t unmapRead_ : 1; //!< Unmap read operation
|
|
uint32_t entire_ : 1; //!< Process the entire memory
|
|
};
|
|
uint32_t flags_;
|
|
};
|
|
|
|
//! Returns the state of entire map
|
|
bool isEntire() const { return (entire_) ? true : false; }
|
|
|
|
//! Returns the state of map write flag
|
|
bool isUnmapWrite() const { return (unmapWrite_) ? true : false; }
|
|
|
|
//! Returns the state of map read flag
|
|
bool isUnmapRead() const { return (unmapRead_) ? true : false; }
|
|
|
|
WriteMapInfo() : origin_(0, 0, 0), region_(0, 0, 0), baseMip_(NULL), flags_(0) {}
|
|
};
|
|
|
|
//! Constructor (from an amd::Memory object).
|
|
Memory(amd::Memory& owner)
|
|
: flags_(0), owner_(&owner), version_(0), mapMemory_(NULL), indirectMapCount_(0) {
|
|
size_ = owner.getSize();
|
|
}
|
|
|
|
//! Constructor (no owner), always eager allocation.
|
|
Memory(size_t size)
|
|
: flags_(0), owner_(NULL), version_(0), mapMemory_(NULL), indirectMapCount_(0), size_(size) {}
|
|
|
|
enum GLResourceOP {
|
|
GLDecompressResource = 0, // orders the GL driver to decompress any depth-stencil or MSAA
|
|
// resource to be sampled by a CL kernel.
|
|
GLInvalidateFBO // orders the GL driver to invalidate any FBO the resource may be bound to,
|
|
// since the resource internal state changed.
|
|
};
|
|
|
|
//! Default destructor for the device memory object
|
|
virtual ~Memory(){};
|
|
|
|
//! Releases virtual objects associated with this memory
|
|
void releaseVirtual();
|
|
|
|
//! Read the size
|
|
size_t size() const { return size_; }
|
|
|
|
//! Gets the owner Memory instance
|
|
amd::Memory* owner() const { return owner_; }
|
|
|
|
//! Immediate blocking write from device cache to owners's backing store.
|
|
//! Marks owner as "current" by resetting the last writer to NULL.
|
|
virtual void syncHostFromCache(device::VirtualDevice* vDev,
|
|
SyncFlags syncFlags = SyncFlags()) {}
|
|
|
|
//! Allocate memory for API-level maps
|
|
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
|
|
const amd::Coord3D& region, //!< The map region in memory
|
|
uint mapFlags, //!< Map flags
|
|
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
|
|
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
|
) {
|
|
return NULL;
|
|
}
|
|
|
|
bool isPersistentMapped() const { return (flags_ & PersistentMap) ? true : false; }
|
|
void setPersistentMapFlag(bool persistentMapped) {
|
|
if (persistentMapped == true) {
|
|
flags_ |= PersistentMap;
|
|
}
|
|
else {
|
|
flags_ &= ~PersistentMap;
|
|
}
|
|
}
|
|
|
|
virtual bool pinSystemMemory(void* hostPtr, //!< System memory address
|
|
size_t size //!< Size of allocated system memory
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
//! Releases indirect map surface
|
|
virtual void releaseIndirectMap() {}
|
|
//! decompress any MSAA/depth-stencil interop surfaces.
|
|
//! notify GL to invalidate any surfaces touched by a CL kernel
|
|
virtual bool processGLResource(GLResourceOP operation) { return false; }
|
|
|
|
//! Map the device memory to CPU visible
|
|
virtual void* cpuMap(VirtualDevice& vDev, //!< Virtual device for map operaiton
|
|
uint flags = 0, //!< flags for the map operation
|
|
// Optimization for multilayer map/unmap
|
|
uint startLayer = 0, //!< Start layer for multilayer map
|
|
uint numLayers = 0, //!< End layer for multilayer map
|
|
size_t* rowPitch = NULL, //!< Row pitch for the device memory
|
|
size_t* slicePitch = NULL //!< Slice pitch for the device memory
|
|
) {
|
|
amd::Image* image = owner()->asImage();
|
|
if (image != NULL) {
|
|
*rowPitch = image->getRowPitch();
|
|
*slicePitch = image->getSlicePitch();
|
|
}
|
|
// Default behavior uses preallocated host mem for CPU
|
|
return owner()->getHostMem();
|
|
}
|
|
|
|
//! Unmap the device memory
|
|
virtual void cpuUnmap(VirtualDevice& vDev //!< Virtual device for unmap operaiton
|
|
) {}
|
|
|
|
//! Saves map info for this object
|
|
//! @note: It's not a thread safe operation, the app must implement
|
|
//! synchronization for the multiple write maps if necessary
|
|
void saveMapInfo(const void* mapAddress, //!< Map cpu address
|
|
const amd::Coord3D origin, //!< Origin of the map location
|
|
const amd::Coord3D region, //!< Mapped region
|
|
uint mapFlags, //!< Map flags
|
|
bool entire, //!< True if the enitre memory was mapped
|
|
amd::Image* baseMip = nullptr //!< The base mip level for map
|
|
);
|
|
|
|
const WriteMapInfo* writeMapInfo(const void* mapAddress) const {
|
|
// Unmap must be serialized.
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
auto it = writeMapInfo_.find(mapAddress);
|
|
if (it == writeMapInfo_.end()) {
|
|
if (writeMapInfo_.size() == 0) {
|
|
LogError("Unmap is a NOP!");
|
|
return nullptr;
|
|
}
|
|
LogWarning("Unknown unmap signature!");
|
|
// Get the first map info
|
|
it = writeMapInfo_.begin();
|
|
}
|
|
return &it->second;
|
|
}
|
|
|
|
//! Clear memory object as mapped read only
|
|
void clearUnmapInfo(const void* mapAddress) {
|
|
// Unmap must be serialized.
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
auto it = writeMapInfo_.find(mapAddress);
|
|
if (it == writeMapInfo_.end()) {
|
|
// Get the first map info
|
|
it = writeMapInfo_.begin();
|
|
}
|
|
if (--it->second.count_ == 0) {
|
|
writeMapInfo_.erase(it);
|
|
}
|
|
}
|
|
|
|
//! Returns the state of memory direct access flag
|
|
bool isHostMemDirectAccess() const { return (flags_ & HostMemoryDirectAccess) ? true : false; }
|
|
|
|
//! Returns the state of host memory registration flag
|
|
bool isHostMemoryRegistered() const { return (flags_ & HostMemoryRegistered) ? true : false; }
|
|
|
|
//! Returns the state of CPU uncached access
|
|
bool isCpuUncached() const { return (flags_ & MemoryCpuUncached) ? true : false; }
|
|
|
|
virtual uint64_t virtualAddress() const { return 0; }
|
|
|
|
virtual uint64_t originalDeviceAddress() const { return 0; }
|
|
|
|
//! Returns CPU pointer to HW state
|
|
virtual const address cpuSrd() const { return nullptr; }
|
|
|
|
//! Returns an export handle for the interprocess communication
|
|
virtual bool ExportHandle(void* handle) const { return false; }
|
|
|
|
bool getAllowedPeerAccess() const { return (flags_ & AllowedPeerAccess) ? true : false; }
|
|
void setAllowedPeerAccess(bool flag) {
|
|
if (flag == true) {
|
|
flags_ |= AllowedPeerAccess;
|
|
}
|
|
else {
|
|
flags_ &= ~AllowedPeerAccess;
|
|
}
|
|
}
|
|
|
|
protected:
|
|
enum Flags {
|
|
HostMemoryDirectAccess = 0x00000001, //!< GPU has direct access to the host memory
|
|
MapResourceAlloced = 0x00000002, //!< Map resource was allocated
|
|
PinnedMemoryAlloced = 0x00000004, //!< An extra pinned resource was allocated
|
|
SubMemoryObject = 0x00000008, //!< Memory is sub-memory
|
|
HostMemoryRegistered = 0x00000010, //!< Host memory was registered
|
|
MemoryCpuUncached = 0x00000020, //!< Memory is uncached on CPU access(slow read)
|
|
AllowedPeerAccess = 0x00000040, //!< Memory can be accessed from peer
|
|
PersistentMap = 0x00000080 //!< Map Peristent memory
|
|
};
|
|
uint flags_; //!< Memory object flags
|
|
|
|
amd::Memory* owner_; //!< The Memory instance that we cache,
|
|
//!< or NULL if we're device-private workspace.
|
|
|
|
volatile size_t version_; //!< The version we're currently shadowing
|
|
|
|
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
|
|
//! not a physical map. When a memory object does not use USE_HOST_PTR we
|
|
//! can use a remote resource and DMA, avoiding the additional CPU memcpy.
|
|
amd::Memory* mapMemory_; //!< Memory used as map target buffer
|
|
volatile size_t indirectMapCount_; //!< Number of maps
|
|
std::unordered_map<const void*, WriteMapInfo>
|
|
writeMapInfo_; //!< Saved write map info for partial unmap
|
|
|
|
//! Increment map count
|
|
void incIndMapCount() { ++indirectMapCount_; }
|
|
|
|
//! Decrement map count
|
|
virtual void decIndMapCount() {}
|
|
|
|
size_t size_; //!< Memory size
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
Memory& operator=(const Memory&) = delete;
|
|
|
|
//! Disable operator=
|
|
Memory(const Memory&) = delete;
|
|
};
|
|
|
|
class Sampler : public amd::HeapObject {
|
|
public:
|
|
//! Constructor
|
|
Sampler() : hwSrd_(0), hwState_(nullptr) {}
|
|
|
|
//! Default destructor for the device memory object
|
|
virtual ~Sampler(){};
|
|
|
|
//! Returns device specific HW state for the sampler
|
|
uint64_t hwSrd() const { return hwSrd_; }
|
|
|
|
//! Returns CPU pointer to HW state
|
|
const address hwState() const { return hwState_; }
|
|
|
|
protected:
|
|
uint64_t hwSrd_; //!< Device specific HW state for the sampler
|
|
address hwState_; //!< CPU pointer to HW state
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
Sampler& operator=(const Sampler&);
|
|
|
|
//! Disable operator=
|
|
Sampler(const Sampler&);
|
|
};
|
|
|
|
class ClBinary : public amd::HeapObject {
|
|
public:
|
|
enum BinaryImageFormat {
|
|
BIF_VERSION2 = 0, //!< Binary Image Format version 2.0 (ELF)
|
|
BIF_VERSION3 //!< Binary Image Format version 3.0 (ELF)
|
|
};
|
|
|
|
//! Constructor
|
|
ClBinary(const amd::Device& dev, BinaryImageFormat bifVer = BIF_VERSION3);
|
|
|
|
//! Destructor
|
|
virtual ~ClBinary();
|
|
|
|
void init(amd::option::Options* optionsObj);
|
|
|
|
/** called only in loading image routines, never storing routines */
|
|
bool setBinary(const char* theBinary, size_t theBinarySize, bool allocated = false,
|
|
amd::Os::FileDesc fd = amd::Os::FDescInit(), size_t foffset = 0,
|
|
std::string uri = std::string());
|
|
|
|
//! setin elfIn_
|
|
bool setElfIn();
|
|
void resetElfIn();
|
|
|
|
//! set out elf
|
|
bool setElfOut(unsigned char eclass, const char* outFile, bool tempFile);
|
|
void resetElfOut();
|
|
|
|
//! Set elf header information
|
|
virtual bool setElfTarget();
|
|
|
|
// class used in for loading images in new format
|
|
amd::Elf* elfIn() { return elfIn_; }
|
|
|
|
// classes used storing and loading images in new format
|
|
amd::Elf* elfOut() { return elfOut_; }
|
|
void elfOut(amd::Elf* v) { elfOut_ = v; }
|
|
|
|
//! Create and save ELF binary image
|
|
bool createElfBinary(bool doencrypt, Program::type_t type);
|
|
|
|
// save BIF binary image
|
|
void saveBIFBinary(const char* binaryIn, size_t size);
|
|
|
|
bool decryptElf(const char* binaryIn, size_t size, char** decryptBin, size_t* decryptSize,
|
|
int* encryptCode);
|
|
|
|
//! Returns the binary pair, fdesc pair, uri for the abstraction layer
|
|
Program::binary_t data() const;
|
|
Program::finfo_t Datafd() const;
|
|
std::string DataURI() const;
|
|
|
|
//! Loads llvmir binary from OCL binary file
|
|
bool loadLlvmBinary(
|
|
std::string& llvmBinary, //!< LLVMIR binary code
|
|
amd::Elf::ElfSections& elfSectionType //!< LLVMIR binary is in SPIR format
|
|
) const;
|
|
|
|
//! Loads compile options from OCL binary file
|
|
bool loadCompileOptions(std::string& compileOptions //!< return the compile options loaded
|
|
) const;
|
|
|
|
//! Loads link options from OCL binary file
|
|
bool loadLinkOptions(std::string& linkOptions //!< return the link options loaded
|
|
) const;
|
|
|
|
//! Store compile options into OCL binary file
|
|
void storeCompileOptions(const std::string& compileOptions //!< the compile options to be stored
|
|
);
|
|
|
|
//! Store link options into OCL binary file
|
|
void storeLinkOptions(const std::string& linkOptions //!< the link options to be stored
|
|
);
|
|
|
|
//! Check if the binary is recompilable
|
|
bool isRecompilable(std::string& llvmBinary, amd::Elf::ElfPlatform thePlatform);
|
|
|
|
void saveOrigBinary(const char* origBinary, size_t origSize) {
|
|
origBinary_ = origBinary;
|
|
origSize_ = origSize;
|
|
}
|
|
|
|
void restoreOrigBinary() {
|
|
if (origBinary_ != NULL) {
|
|
(void)setBinary(origBinary_, origSize_, false);
|
|
}
|
|
}
|
|
|
|
//! Set Binary flags
|
|
void setFlags(int encryptCode);
|
|
|
|
bool saveSOURCE() { return ((flags_ & BinarySourceMask) == BinarySaveSource); }
|
|
bool saveLLVMIR() { return ((flags_ & BinaryLlvmirMask) == BinarySaveLlvmir); }
|
|
bool saveISA() { return ((flags_ & BinaryIsaMask) == BinarySaveIsa); }
|
|
|
|
bool saveAS() { return ((flags_ & BinaryASMask) == BinarySaveAS); }
|
|
|
|
// Return the encrypt code for this input binary ( "> 0" means encrypted)
|
|
int getEncryptCode() { return encryptCode_; }
|
|
|
|
// Returns TRUE of binary file is SPIR
|
|
bool isSPIR() const;
|
|
// Returns TRUE of binary file is SPIRV
|
|
bool isSPIRV() const;
|
|
|
|
protected:
|
|
enum Flags {
|
|
BinaryAllocated = 0x1, //!< Binary was created
|
|
|
|
// Source control
|
|
BinaryNoSaveSource = 0x0, // 0: default
|
|
BinaryRemoveSource = 0x2, // for encrypted binary
|
|
BinarySaveSource = 0x4,
|
|
BinarySourceMask = 0x6,
|
|
|
|
// LLVMIR control
|
|
BinarySaveLlvmir = 0x0, // 0: default
|
|
BinaryRemoveLlvmir = 0x8, // for encrypted binary
|
|
BinaryNoSaveLlvmir = 0x10,
|
|
BinaryLlvmirMask = 0x18,
|
|
|
|
// ISA control
|
|
BinarySaveIsa = 0x0, // 0: default
|
|
BinaryRemoveIsa = 0x80, // for encrypted binary
|
|
BinaryNoSaveIsa = 0x100,
|
|
BinaryIsaMask = 0x180,
|
|
|
|
// AS control
|
|
BinaryNoSaveAS = 0x0, // 0: default
|
|
BinaryRemoveAS = 0x200, // for encrypted binary
|
|
BinarySaveAS = 0x400,
|
|
BinaryASMask = 0x600
|
|
};
|
|
|
|
//! Returns TRUE if binary file was allocated
|
|
bool isBinaryAllocated() const { return (flags_ & BinaryAllocated) ? true : false; }
|
|
|
|
#if defined(WITH_COMPILER_LIB)
|
|
//! Returns BIF symbol name by symbolID,
|
|
//! returns empty string if not found or if BIF version is unsupported
|
|
std::string getBIFSymbol(unsigned int symbolID) const;
|
|
#endif
|
|
|
|
protected:
|
|
const amd::Device& dev_; //!< Device object
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
ClBinary(const ClBinary&);
|
|
|
|
//! Disable default operator=
|
|
ClBinary& operator=(const ClBinary&);
|
|
|
|
//! Releases the binary data store
|
|
void release();
|
|
|
|
const char* binary_; //!< binary data
|
|
size_t size_; //!< binary size
|
|
uint flags_; //!< CL binary object flags
|
|
|
|
amd::Os::FileDesc fdesc_; //!< file descriptor
|
|
size_t foffset_; //!< file offset
|
|
std::string uri_; //!< memory URI
|
|
|
|
const char* origBinary_; //!< original binary data
|
|
size_t origSize_; //!< original binary size
|
|
|
|
int encryptCode_; //!< Encryption Code for input binary (0 for not encrypted)
|
|
|
|
std::string fname_; //!< name of elf dump file
|
|
bool tempFile_; //!< Is the elf dump file temporary
|
|
|
|
protected:
|
|
amd::Elf* elfIn_; //!< ELF object for input ELF binary
|
|
amd::Elf* elfOut_; //!< ELF object for output ELF binary
|
|
BinaryImageFormat format_; //!< which binary image format to use
|
|
};
|
|
|
|
inline const Program::binary_t Program::binary() const {
|
|
if (clBinary() == NULL) {
|
|
return {(const void*)0, 0};
|
|
}
|
|
return clBinary()->data();
|
|
}
|
|
|
|
inline std::string Program::BinaryURI() const {
|
|
if (clBinary() == NULL) {
|
|
return std::string();
|
|
}
|
|
return clBinary()->DataURI();
|
|
}
|
|
|
|
inline Program::finfo_t Program::BinaryFd() const {
|
|
if (clBinary() == NULL) {
|
|
return {amd::Os::FDescInit(), 0};
|
|
}
|
|
return clBinary()->Datafd();
|
|
}
|
|
|
|
inline Program::binary_t Program::binary() {
|
|
if (clBinary() == NULL) {
|
|
return {(const void*)0, 0};
|
|
}
|
|
return clBinary()->data();
|
|
}
|
|
|
|
/*! \class PerfCounter
|
|
*
|
|
* \brief The device interface class for the performance counters
|
|
*/
|
|
class PerfCounter : public amd::HeapObject {
|
|
public:
|
|
//! Constructor for the device performance
|
|
PerfCounter() {}
|
|
|
|
//! Get the performance counter info
|
|
virtual uint64_t getInfo(uint64_t infoType) const = 0;
|
|
|
|
//! Destructor for PerfCounter class
|
|
virtual ~PerfCounter() {}
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
PerfCounter(const PerfCounter&);
|
|
|
|
//! Disable default operator=
|
|
PerfCounter& operator=(const PerfCounter&);
|
|
};
|
|
/*! \class ThreadTrace
|
|
*
|
|
* \brief The device interface class for the performance counters
|
|
*/
|
|
class ThreadTrace : public amd::HeapObject {
|
|
public:
|
|
//! Constructor for the device performance
|
|
ThreadTrace() {}
|
|
//! Update ThreadTrace status to true/false if new buffer was binded/unbinded respectively
|
|
virtual void setNewBufferBinded(bool) = 0;
|
|
//! Get the performance counter info
|
|
virtual bool info(uint infoType, uint* info, uint infoSize) const = 0;
|
|
//! Destructor for PerfCounter class
|
|
virtual ~ThreadTrace() {}
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
ThreadTrace(const ThreadTrace&);
|
|
|
|
//! Disable default operator=
|
|
ThreadTrace& operator=(const ThreadTrace&);
|
|
};
|
|
|
|
//! A device execution environment.
|
|
class VirtualDevice : public amd::HeapObject {
|
|
public:
|
|
//! Construct a new virtual device for the given physical device.
|
|
VirtualDevice(amd::Device& device)
|
|
: device_(device)
|
|
, blitMgr_(NULL)
|
|
, execution_("Virtual device execution lock", true)
|
|
, index_(0) {}
|
|
|
|
//! Destroy this virtual device.
|
|
virtual ~VirtualDevice() {}
|
|
|
|
//! Return the physical device for this virtual device.
|
|
const amd::Device& device() const { return device_(); }
|
|
|
|
virtual void submitReadMemory(amd::ReadMemoryCommand& cmd) = 0;
|
|
virtual void submitWriteMemory(amd::WriteMemoryCommand& cmd) = 0;
|
|
virtual void submitCopyMemory(amd::CopyMemoryCommand& cmd) = 0;
|
|
virtual void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) = 0;
|
|
virtual void submitMapMemory(amd::MapMemoryCommand& cmd) = 0;
|
|
virtual void submitUnmapMemory(amd::UnmapMemoryCommand& cmd) = 0;
|
|
virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
|
|
virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
|
|
virtual void submitMarker(amd::Marker& cmd) = 0;
|
|
virtual void submitAccumulate(amd::AccumulateCommand& cmd) = 0;
|
|
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) = 0;
|
|
virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
|
|
virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
|
|
virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) = 0;
|
|
virtual void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) = 0;
|
|
virtual void submitPerfCounter(amd::PerfCounterCommand& cmd) = 0;
|
|
virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) = 0;
|
|
virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd) = 0;
|
|
virtual void flush(amd::Command* list = NULL, bool wait = false) = 0;
|
|
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
|
|
/// Optional extensions
|
|
virtual void submitSignal(amd::SignalCommand& cmd) = 0;
|
|
virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0;
|
|
virtual void submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
|
ShouldNotReachHere();
|
|
}
|
|
virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); }
|
|
virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
|
|
|
|
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
|
|
|
|
//! Get the blit manager object
|
|
device::BlitManager& blitMgr() const { return *blitMgr_; }
|
|
|
|
//! Returns the monitor object for execution access by VirtualGPU
|
|
amd::Monitor& execution() { return execution_; }
|
|
|
|
//! Returns the virtual device unique index
|
|
uint index() const { return index_; }
|
|
|
|
//! Returns true if device has active wait setting
|
|
bool ActiveWait() const;
|
|
|
|
//! Returns the status of queue handler callback
|
|
virtual bool isHandlerPending() const = 0;
|
|
|
|
//! Returns fence state of the VirtualGPU
|
|
virtual bool isFenceDirty() const = 0;
|
|
|
|
//! Dispatch captured AQL packet
|
|
virtual bool dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd = nullptr) = 0;
|
|
|
|
//! Resets fence state of the VirtualGPU
|
|
virtual void resetFenceDirty() = 0;
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
VirtualDevice& operator=(const VirtualDevice&);
|
|
|
|
//! Disable operator=
|
|
VirtualDevice(const VirtualDevice&);
|
|
|
|
//! The physical device that this virtual device utilizes
|
|
amd::SharedReference<amd::Device> device_;
|
|
|
|
protected:
|
|
device::BlitManager* blitMgr_; //!< Blit manager
|
|
|
|
amd::Monitor execution_; //!< Lock to serialise access to all device objects
|
|
uint index_; //!< The virtual device unique index
|
|
};
|
|
|
|
} // namespace device
|
|
|
|
namespace amd {
|
|
|
|
//! MemoryObject map lookup class
|
|
class MemObjMap : public AllStatic {
|
|
public:
|
|
static size_t size(); //!< obtain the size of the container
|
|
static void AddMemObj(const void* k,
|
|
amd::Memory* v); //!< add the host mem pointer and buffer in the container
|
|
static void RemoveMemObj(const void* k); //!< Remove an entry of mem object from the container
|
|
static amd::Memory* FindMemObj(
|
|
const void* k, //!< find the mem object based on the input pointer
|
|
size_t* offset = nullptr); //!< Offset in the memory location
|
|
static void UpdateAccess(amd::Device *peerDev);
|
|
static void Purge(amd::Device* dev); //!< Purge all user allocated memories on the given device
|
|
|
|
static void AddVirtualMemObj(const void* k,
|
|
amd::Memory* v); //!< Same as AddMemObj but for virtual addressing
|
|
static void RemoveVirtualMemObj(const void* k); //!< Same as RemoveMemObj but for virtual addressing
|
|
static amd::Memory* FindVirtualMemObj(
|
|
const void* k); //!< Same as FindMemObj but for virtual addressing
|
|
private:
|
|
static std::map<uintptr_t, amd::Memory*>
|
|
MemObjMap_; //!< the mem object<->hostptr information container
|
|
static std::map<uintptr_t, amd::Memory*>
|
|
VirtualMemObjMap_; //!< the virtual mem object<->hostptr information container
|
|
static amd::Monitor AllocatedLock_; //!< amd monitor locker
|
|
};
|
|
|
|
/// @brief Instruction Set Architecture properties.
|
|
class Isa {
|
|
public:
|
|
|
|
/// @brief Isa's target feature setting type.
|
|
enum class Feature : uint8_t {
|
|
Unsupported,
|
|
Any,
|
|
Disabled,
|
|
Enabled,
|
|
};
|
|
|
|
//! Return a non-zero uint64_t value that uniquely identifies the device.
|
|
//! This can be used when a scalar value handle to the device is require.
|
|
static uint64_t toHandle(const Isa *isa) {
|
|
static_assert(sizeof(isa) <= sizeof(uint64_t), "Handle size does not match pointer size");
|
|
assert((reinterpret_cast<uint64_t>(static_cast<const Isa*>(nullptr)) == 0) &&
|
|
"nullptr value is not 0");
|
|
return isa ? reinterpret_cast<uint64_t>(isa) : 0;
|
|
}
|
|
|
|
//! Return the device corresponding to a handle returned by Isa::handle,
|
|
//! or nullptr if the handle is 0. This can be used when a scalar value
|
|
//! handle for a device is provided.
|
|
static const Isa* fromHandle(uint64_t handle) {
|
|
static_assert(sizeof(handle) <= sizeof(uint64_t), "Handle size does not match pointer size");
|
|
assert((reinterpret_cast<uint64_t>(static_cast<const Isa*>(nullptr)) == 0) &&
|
|
"nullptr value is not 0");
|
|
return handle ? reinterpret_cast<const Isa*>(handle) : nullptr;
|
|
}
|
|
|
|
/// @returns This Isa's target triple and target ID name.
|
|
std::string isaName() const;
|
|
|
|
/// @returns This Isa's processor name.
|
|
std::string processorName() const;
|
|
|
|
/// @returns This Isa's target ID name.
|
|
const char *targetId() const {
|
|
return targetId_;
|
|
}
|
|
|
|
/// @returns This Isa's name to use with the HSAIL compiler.
|
|
const char *hsailName() const {
|
|
return hsailId_;
|
|
}
|
|
|
|
/// @returns If the ROCm runtime supports the ISA.
|
|
bool runtimeRocSupported() const {
|
|
if (!IS_HIP && (versionMajor_ == 8)) {
|
|
return false;
|
|
}
|
|
return runtimeRocSupported_;
|
|
}
|
|
|
|
/// @returns If the PAL runtime supports the ISA.
|
|
bool runtimePalSupported() const {
|
|
if (IS_LINUX && (GPU_ENABLE_PAL == 2) && (versionMajor_ >= 9)) {
|
|
return false;
|
|
}
|
|
return runtimePalSupported_;
|
|
}
|
|
|
|
/// @returns SRAM ECC feature status.
|
|
const Feature &sramecc() const {
|
|
return sramecc_;
|
|
}
|
|
|
|
/// @returns XNACK feature status.
|
|
const Feature &xnack() const {
|
|
return xnack_;
|
|
}
|
|
|
|
/// @returns True if SRAMECC feature is supported, false otherwise.
|
|
bool isSrameccSupported() const {
|
|
return sramecc_ != Feature::Unsupported;
|
|
}
|
|
|
|
/// @returns True if XNACK feature is supported, false otherwise.
|
|
bool isXnackSupported() const {
|
|
return xnack_ != Feature::Unsupported;
|
|
}
|
|
|
|
/// @returns This Isa's major version.
|
|
uint32_t versionMajor() const {
|
|
return versionMajor_;
|
|
}
|
|
|
|
/// @returns This Isa's minor version.
|
|
uint32_t versionMinor() const {
|
|
return versionMinor_;
|
|
}
|
|
|
|
/// @returns This Isa's stepping version.
|
|
uint32_t versionStepping() const {
|
|
return versionStepping_;
|
|
}
|
|
|
|
/// @returns This Isa's number of SIMDs per CU.
|
|
uint32_t simdPerCU() const {
|
|
return simdPerCU_;
|
|
}
|
|
|
|
/// @returns This Isa's
|
|
uint32_t simdWidth() const {
|
|
return simdWidth_;
|
|
}
|
|
|
|
/// @returns This Isa's number of instructions processed per SIMD.
|
|
uint32_t simdInstructionWidth() const {
|
|
return simdInstructionWidth_;
|
|
}
|
|
|
|
/// @returns This Isa's memory channel bank width.
|
|
uint32_t memChannelBankWidth() const {
|
|
return memChannelBankWidth_;
|
|
}
|
|
|
|
/// @returns This Isa's local memory size per CU.
|
|
uint32_t localMemSizePerCU() const {
|
|
return localMemSizePerCU_;
|
|
}
|
|
|
|
/// @returns This Isa's number of banks of local memory.
|
|
uint32_t localMemBanks() const {
|
|
return localMemBanks_;
|
|
}
|
|
|
|
/// @returns True if @p codeObjectIsa and @p agentIsa are compatible,
|
|
/// false otherwise.
|
|
static bool isCompatible(const Isa &codeObjectIsa, const Isa &agentIsa);
|
|
|
|
/// @returns Isa for requested @p isaName, null pointer if not supported.
|
|
static const Isa* findIsa(const char *isaName);
|
|
|
|
/// @returns Isa for requested @p version, null pointer if not supported.
|
|
static const Isa* findIsa(uint32_t versionMajor, uint32_t versionMinor, uint32_t versionStepping,
|
|
Feature sramecc = Feature::Any, Feature xnack = Feature::Any);
|
|
|
|
/// @returns Iterator for first isa.
|
|
static const Isa* begin();
|
|
|
|
/// @returns Iterator for one past the end isa.
|
|
static const Isa* end();
|
|
|
|
private:
|
|
|
|
constexpr Isa(const char* targetId, const char* hsailId,
|
|
bool runtimeRocSupported, bool runtimePalSupported,
|
|
uint32_t versionMajor, uint32_t versionMinor, uint32_t versionStepping,
|
|
Feature sramecc, Feature xnack, uint32_t simdPerCU, uint32_t simdWidth,
|
|
uint32_t simdInstructionWidth, uint32_t memChannelBankWidth,
|
|
uint32_t localMemSizePerCU, uint32_t localMemBanks)
|
|
: targetId_(targetId),
|
|
hsailId_(hsailId),
|
|
runtimeRocSupported_(runtimeRocSupported),
|
|
runtimePalSupported_(runtimePalSupported),
|
|
versionMajor_(versionMajor),
|
|
versionMinor_(versionMinor),
|
|
versionStepping_(versionStepping),
|
|
sramecc_(sramecc),
|
|
xnack_(xnack),
|
|
simdPerCU_(simdPerCU),
|
|
simdWidth_(simdWidth),
|
|
simdInstructionWidth_(simdInstructionWidth),
|
|
memChannelBankWidth_(memChannelBankWidth),
|
|
localMemSizePerCU_(localMemSizePerCU),
|
|
localMemBanks_(localMemBanks) {}
|
|
|
|
// @brief Returns the begin and end iterators for the suppported ISAs.
|
|
static std::pair<const Isa*, const Isa*> supportedIsas();
|
|
|
|
// @brief Isa's target ID name. Used for LLVM COde Object Manager
|
|
// compilations.
|
|
const char* targetId_;
|
|
|
|
// @brief Isa's HSAIL name. Used for the Compiler Library for HSAIL
|
|
// compilation using the Shader Compiler Finalizer. Empty string if
|
|
// unsupported.
|
|
const char* hsailId_;
|
|
|
|
bool runtimeRocSupported_; //!< ROCm runtime is supported.
|
|
bool runtimePalSupported_; //!< PAL runtime is supported.
|
|
uint32_t versionMajor_; //!< Isa's major version.
|
|
uint32_t versionMinor_; //!< Isa's minor version.
|
|
uint32_t versionStepping_; //!< Isa's stepping version.
|
|
Feature sramecc_; //!< SRAMECC feature.
|
|
Feature xnack_; //!< XNACK feature.
|
|
uint32_t simdPerCU_; //!< Number of SIMDs per CU.
|
|
uint32_t simdWidth_; //!< Number of workitems processed per SIMD.
|
|
uint32_t simdInstructionWidth_; //!< Number of instructions processed per SIMD.
|
|
uint32_t memChannelBankWidth_; //!< Memory channel bank width.
|
|
uint32_t localMemSizePerCU_; //!< Local memory size per CU.
|
|
uint32_t localMemBanks_; //!< Number of banks of local memory.
|
|
}; // class Isa
|
|
|
|
/*! \addtogroup Runtime
|
|
* @{
|
|
*
|
|
* \addtogroup Device Device Abstraction
|
|
* @{
|
|
*/
|
|
class Device : public RuntimeObject {
|
|
protected:
|
|
#if defined(WITH_COMPILER_LIB)
|
|
typedef aclCompiler Compiler;
|
|
#endif
|
|
|
|
public:
|
|
// The structures below for MGPU launch match the device library format
|
|
struct MGSyncData {
|
|
uint32_t w0;
|
|
uint32_t w1;
|
|
};
|
|
|
|
struct MGSyncInfo {
|
|
struct MGSyncData* mgs;
|
|
uint32_t grid_id;
|
|
uint32_t num_grids;
|
|
uint64_t prev_sum;
|
|
uint64_t all_sum;
|
|
struct MGSyncData sgs;
|
|
uint num_wg;
|
|
};
|
|
|
|
//Attributes that could be retrived from hsa_amd_memory_pool_link_info_t.
|
|
typedef enum LinkAttribute {
|
|
kLinkLinkType = 0,
|
|
kLinkHopCount,
|
|
kLinkDistance,
|
|
kLinkAtomicSupport
|
|
} LinkAttribute;
|
|
|
|
typedef enum MemorySegment {
|
|
kNoAtomics = 0,
|
|
kAtomics = 1,
|
|
kKernArg = 2
|
|
} MemorySegment;
|
|
|
|
typedef enum CacheState {
|
|
kCacheStateInvalid = -1,
|
|
kCacheStateIgnore = 0,
|
|
kCacheStateAgent = 1,
|
|
kCacheStateSystem = 2
|
|
} CacheState;
|
|
|
|
//<! Enum describing the access permissions of Virtual memory
|
|
enum class VmmAccess {
|
|
kNone = 0x0,
|
|
kReadOnly = 0x1,
|
|
kWriteOnly = 0x2,
|
|
kReadWrite = 0x3
|
|
};
|
|
|
|
typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;
|
|
|
|
static constexpr size_t kP2PStagingSize = 4 * Mi;
|
|
static constexpr size_t kMGSyncDataSize = sizeof(MGSyncData);
|
|
static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
|
|
static constexpr size_t kSGInfoSize = kMGSyncDataSize;
|
|
|
|
typedef std::list<CommandQueue*> CommandQueues;
|
|
|
|
struct BlitProgram : public amd::HeapObject {
|
|
Program* program_; //!< GPU program object
|
|
Context* context_; //!< A dummy context
|
|
|
|
BlitProgram(Context* context) : program_(NULL), context_(context) {}
|
|
~BlitProgram();
|
|
|
|
//! Creates blit program for this device
|
|
bool create(Device* device, //!< Device object
|
|
const std::string& extraKernel, //!< Extra kernels from the device layer
|
|
const std::string& extraOptions //!< Extra compilation options
|
|
);
|
|
};
|
|
|
|
#if defined(WITH_COMPILER_LIB)
|
|
virtual Compiler* compiler() const = 0;
|
|
virtual Compiler* binCompiler() const { return compiler(); }
|
|
#endif
|
|
|
|
Device();
|
|
virtual ~Device();
|
|
|
|
//! Initializes abstraction layer device object
|
|
bool create(const Isa &isa);
|
|
|
|
uint retain() {
|
|
// Overwrite the RuntimeObject::retain().
|
|
// There is an issue in the old SHOC11_DeviceMemory test on TC
|
|
return 0u;
|
|
}
|
|
|
|
uint release() {
|
|
// Overwrite the RuntimeObject::release().
|
|
// There is an issue in the old SHOC11_DeviceMemory test on TC
|
|
return 0u;
|
|
}
|
|
|
|
//! Register a device as available
|
|
void registerDevice();
|
|
|
|
//! Initialize the device layer (enumerate known devices)
|
|
static bool init();
|
|
|
|
//! Shutdown the device layer
|
|
static void tearDown();
|
|
|
|
static std::vector<Device*> getDevices(cl_device_type type, //!< Device type
|
|
bool offlineDevices //!< Enable offline devices
|
|
);
|
|
|
|
static size_t numDevices(cl_device_type type, //!< Device type
|
|
bool offlineDevices //!< Enable offline devices
|
|
);
|
|
|
|
static bool getDeviceIDs(cl_device_type deviceType, //!< Device type
|
|
uint32_t numEntries, //!< Number of entries in the array
|
|
cl_device_id* devices, //!< Array of the device ID(s)
|
|
uint32_t* numDevices, //!< Number of available devices
|
|
bool offlineDevices //!< Report offline devices
|
|
);
|
|
|
|
const device::Info& info() const { return info_; }
|
|
|
|
//! Return svm support capability.
|
|
bool svmSupport() const {
|
|
return (info().svmCapabilities_ &
|
|
(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER |
|
|
CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0
|
|
? true
|
|
: false;
|
|
}
|
|
|
|
//! check svm FGS support capability.
|
|
inline bool isFineGrainedSystem(bool FGSOPT = false) const {
|
|
return FGSOPT && (info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) != 0 ? true
|
|
: false;
|
|
}
|
|
|
|
//! Return this device's type.
|
|
cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
|
|
|
|
//! Create a new virtual device environment.
|
|
virtual device::VirtualDevice* createVirtualDevice(CommandQueue* queue = NULL) = 0;
|
|
|
|
//! Create a program for device.
|
|
virtual device::Program* createProgram(amd::Program& owner, option::Options* options = NULL) = 0;
|
|
|
|
//! Allocate a chunk of device memory as a cache for a CL memory object
|
|
virtual device::Memory* createMemory(Memory& owner) const = 0;
|
|
|
|
//! Allocate a chunk of device memory without owner class
|
|
virtual device::Memory* createMemory(size_t size) const = 0;
|
|
|
|
//! Allocate a device sampler object
|
|
virtual bool createSampler(const Sampler&, device::Sampler**) const = 0;
|
|
|
|
//! Allocates a view object from the device memory
|
|
virtual device::Memory* createView(
|
|
amd::Memory& owner, //!< Owner memory object
|
|
const device::Memory& parent //!< Parent device memory object for the view
|
|
) const = 0;
|
|
|
|
///! Allocates a device signal object
|
|
virtual device::Signal* createSignal() const = 0;
|
|
|
|
//! Return true if initialized external API interop, otherwise false
|
|
virtual bool bindExternalDevice(
|
|
uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
|
|
void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL
|
|
void* pContext, //!< HGLRC/GLXContext handle
|
|
bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do
|
|
//! not bind.
|
|
) = 0;
|
|
|
|
virtual bool unbindExternalDevice(
|
|
uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
|
|
void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL
|
|
void* pContext, //!< HGLRC/GLXContext handle
|
|
bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do
|
|
//! not bind.
|
|
) = 0;
|
|
|
|
//! resolves GL depth/msaa buffer
|
|
virtual bool resolveGLMemory(device::Memory*) const { return true; }
|
|
|
|
//! Gets free memory on a GPU device
|
|
virtual bool globalFreeMemory(size_t* freeMemory //!< Free memory information on a GPU device
|
|
) const = 0;
|
|
|
|
virtual bool importExtSemaphore(void** extSemaphore, const amd::Os::FileDesc& handle,
|
|
amd::ExternalSemaphoreHandleType sem_handle_type) = 0;
|
|
virtual void DestroyExtSemaphore(void* extSemaphore) = 0;
|
|
/**
|
|
* @return True if the device has its own custom host allocator to be used
|
|
* instead of the generic OS allocation routines
|
|
*/
|
|
bool customHostAllocator() const { return settings().customHostAllocator_ == 1; }
|
|
|
|
/**
|
|
* @copydoc amd::Context::hostAlloc
|
|
*/
|
|
virtual void* hostAlloc(size_t size, size_t alignment,
|
|
MemorySegment mem_seg = kNoAtomics) const {
|
|
ShouldNotCallThis();
|
|
return NULL;
|
|
}
|
|
|
|
virtual void* deviceLocalAlloc(size_t size, bool atomics = false,
|
|
bool pseudo_fine_grain = false) const {
|
|
ShouldNotCallThis();
|
|
return NULL;
|
|
}
|
|
|
|
virtual bool isXgmi() const {
|
|
ShouldNotCallThis();
|
|
return false;
|
|
}
|
|
|
|
virtual bool deviceAllowAccess(void* dst) const {
|
|
ShouldNotCallThis();
|
|
return true;
|
|
}
|
|
|
|
virtual bool allowPeerAccess(device::Memory* memory) const {
|
|
ShouldNotCallThis();
|
|
return true;
|
|
}
|
|
|
|
bool enableP2P(amd::Device* ptrDev);
|
|
|
|
bool disableP2P(amd::Device* ptrDev);
|
|
|
|
/**
|
|
* @copydoc amd::Context::hostFree
|
|
*/
|
|
virtual void hostFree(void* ptr, size_t size = 0) const { ShouldNotCallThis(); }
|
|
|
|
/**
|
|
* @copydoc amd::Context::svmAlloc
|
|
*/
|
|
virtual void* svmAlloc(Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
|
|
void* svmPtr) const = 0;
|
|
/**
|
|
* @copydoc amd::Context::svmFree
|
|
*/
|
|
virtual void svmFree(void* ptr) const = 0;
|
|
|
|
/**
|
|
* Reserve a VA range with no backing store
|
|
*
|
|
* @param addr Start address requested
|
|
* @param size Size of the range in bytes
|
|
* @param alignment Alignment in bytes
|
|
*/
|
|
virtual void* virtualAlloc(void* addr, size_t size, size_t alignment) = 0;
|
|
|
|
/**
|
|
* Set Access permisions for a virtual memory object.
|
|
*
|
|
* @param va_addr Virtual Address ptr
|
|
* @param va_size Virtual Address Size
|
|
* @param access_flags Access permissions
|
|
* @param count Number of access permissions
|
|
*/
|
|
virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags,
|
|
size_t count) = 0;
|
|
|
|
/**
|
|
* Get Access permisions for a virtual memory object.
|
|
*
|
|
* @param va_addr Virtual Address ptr
|
|
* @param access_flags_ptr Access permissions to be filled
|
|
*/
|
|
virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) = 0;
|
|
|
|
/**
|
|
* Free a VA range
|
|
*
|
|
* @param addr Start address of the range
|
|
*/
|
|
virtual void virtualFree(void* addr) = 0;
|
|
|
|
/**
|
|
* @return True if the device successfully applied the SVM attributes in HMM for device memory
|
|
*/
|
|
virtual bool SetSvmAttributes(const void* dev_ptr, size_t count,
|
|
amd::MemoryAdvice advice, bool use_cpu = false) const {
|
|
ShouldNotCallThis();
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* @return True if the device successfully retrieved the SVM attributes from HMM for device memory
|
|
*/
|
|
virtual bool GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
|
size_t num_attributes, const void* dev_ptr, size_t count) const {
|
|
ShouldNotCallThis();
|
|
return false;
|
|
}
|
|
|
|
//! Validate kernel
|
|
virtual bool validateKernel(const amd::Kernel& kernel,
|
|
const device::VirtualDevice* vdev,
|
|
bool coop_groups = false) {
|
|
return true;
|
|
};
|
|
|
|
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
|
|
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
|
return true;
|
|
};
|
|
|
|
// Returns the status of HW event, associated with amd::Event
|
|
virtual bool IsHwEventReady(
|
|
const amd::Event& event, //!< AMD event for HW status validation
|
|
bool wait = false) const { //!< If true then forces the event completion
|
|
return false;
|
|
};
|
|
|
|
// Returns the status of HW event, associated with amd::Event
|
|
virtual bool IsHwEventReadyForcedWait(
|
|
const amd::Event& event) const { //!< AMD event for HW status validation
|
|
return false;
|
|
};
|
|
|
|
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
|
|
|
|
virtual const uint32_t getPreferredNumaNode() const { return 0; }
|
|
virtual void ReleaseGlobalSignal(void* signal) const {}
|
|
virtual const bool isFineGrainSupported() const {
|
|
return (info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS) != 0 ? true : false;
|
|
}
|
|
|
|
//! Returns TRUE if the device is available for computations
|
|
bool isOnline() const { return online_; }
|
|
|
|
//! Returns device isa.
|
|
const Isa &isa() const {
|
|
assert(isa_);
|
|
return *isa_;
|
|
}
|
|
|
|
//! Return a non-zero uint64_t value that uniquely identifies the device.
|
|
//! This can be used when a scalar value handle to the device is require.
|
|
static uint64_t toHandle(const Device *device) {
|
|
static_assert(sizeof(device) <= sizeof(uint64_t), "Handle size does not match pointer size");
|
|
assert((reinterpret_cast<uint64_t>(static_cast<const Device*>(nullptr)) == 0) &&
|
|
"nullptr value is not 0");
|
|
return device ? reinterpret_cast<uint64_t>(device) : 0;
|
|
}
|
|
|
|
//! Return the device corresponding to a handle returned by Device::handle,
|
|
//! or nullptr if the handle is 0. This can be used when a scalar value
|
|
//! handle for a device is provided.
|
|
static const Device* fromHhandle(uint64_t handle) {
|
|
static_assert(sizeof(handle) <= sizeof(uint64_t), "Handle size does not match pointer size");
|
|
assert((reinterpret_cast<uint64_t>(static_cast<const Device*>(nullptr)) == 0) &&
|
|
"nullptr value is not 0");
|
|
return handle ? reinterpret_cast<const Device*>(handle) : nullptr;
|
|
}
|
|
|
|
//! Returns device settings
|
|
const device::Settings& settings() const { return *settings_; }
|
|
|
|
//! Returns blit program info structure
|
|
BlitProgram* blitProgram() const { return blitProgram_; }
|
|
|
|
//! RTTI internal implementation
|
|
virtual ObjectType objectType() const { return ObjectTypeDevice; }
|
|
|
|
//! Returns app profile
|
|
static const AppProfile* appProfile() { return &appProfile_; }
|
|
|
|
//! Adds GPU memory to the VA cache list
|
|
void addVACache(device::Memory* memory) const;
|
|
|
|
//! Removes GPU memory from the VA cache list
|
|
void removeVACache(const device::Memory* memory) const;
|
|
|
|
//! Finds GPU memory from virtual address
|
|
device::Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
|
|
|
|
static std::vector<Device*>& devices() { return *devices_; }
|
|
|
|
// P2P devices that are accessible from the current device
|
|
std::vector<cl_device_id> p2pDevices_;
|
|
|
|
// P2P devices for memory allocation. This list contains devices that can have access to the
|
|
// current device
|
|
std::vector<Device*> p2p_access_devices_;
|
|
|
|
//! Checks if OCL runtime can use code object manager for compilation
|
|
bool ValidateComgr();
|
|
|
|
//! Checks if OCL runtime can use hsail for compilation
|
|
bool ValidateHsail();
|
|
|
|
bool IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* mem_offset) const;
|
|
|
|
bool IpcAttach(const void* handle, size_t mem_size, size_t mem_offset, unsigned int flags,
|
|
void** dev_ptr) const;
|
|
|
|
bool IpcDetach(void* dev_ptr) const;
|
|
|
|
//! Return context
|
|
amd::Context& context() const { return *context_; }
|
|
|
|
//! Return private global device context for P2P allocations
|
|
amd::Context& GlbCtx() const { return *glb_ctx_; }
|
|
|
|
//! Lock protect P2P staging operations
|
|
Monitor& P2PStageOps() const { return p2p_stage_ops_; }
|
|
|
|
//! Staging buffer for P2P transfer
|
|
Memory* P2PStage() const { return p2p_stage_; }
|
|
|
|
//! Returns heap buffer object for device allocator
|
|
device::Memory* HeapBuffer() const { return heap_buffer_; }
|
|
|
|
//! Returns stack size set for the device
|
|
uint64_t StackSize() const { return stack_size_; }
|
|
|
|
//! Sets the stack size of the device
|
|
bool UpdateStackSize(uint64_t stackSize);
|
|
|
|
//! Returns initial heap size
|
|
uint64_t InitialHeapSize() const { return initial_heap_size_; }
|
|
|
|
//! Sets the heap size of the device
|
|
bool UpdateInitialHeapSize(uint64_t initialHeapSize);
|
|
|
|
//! Does this device allow P2P access?
|
|
bool P2PAccessAllowed() const { return (p2p_access_devices_.size() > 0) ? true : false; }
|
|
|
|
//! Returns the list of devices that can have access to the current
|
|
const std::vector<Device*>& P2PAccessDevices() const { return p2p_access_devices_; }
|
|
|
|
//! Returns index of current device
|
|
uint32_t index() const { return index_; }
|
|
|
|
//! Returns value for LinkAttribute for lost of vectors
|
|
virtual bool findLinkInfo(const amd::Device& other_device,
|
|
std::vector<LinkAttrType>* link_attr) {
|
|
return false;
|
|
}
|
|
|
|
// Notifies device about context destroy
|
|
virtual void ContextDestroy() {}
|
|
|
|
//! Returns active wait state for this device
|
|
bool ActiveWait() const { return activeWait_; }
|
|
|
|
void SetActiveWait(bool state) { activeWait_ = state; }
|
|
|
|
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0) {
|
|
return nullptr;
|
|
}
|
|
|
|
#if defined(__clang__)
|
|
#if __has_feature(address_sanitizer)
|
|
virtual device::UriLocator* createUriLocator() const = 0;
|
|
#endif
|
|
#endif
|
|
protected:
|
|
//! Enable the specified extension
|
|
char* getExtensionString();
|
|
|
|
device::Info info_; //!< Device info structure
|
|
device::Settings* settings_; //!< Device settings
|
|
union {
|
|
struct {
|
|
uint32_t online_: 1; //!< The device in online
|
|
uint32_t activeWait_: 1; //!< If true device requires active wait
|
|
};
|
|
uint32_t state_; //!< State bit mask
|
|
};
|
|
|
|
BlitProgram* blitProgram_; //!< Blit program info
|
|
static AppProfile appProfile_; //!< application profile
|
|
amd::Context* context_; //!< Context
|
|
|
|
static amd::Context* glb_ctx_; //!< Global context with all devices
|
|
static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources
|
|
static Memory* p2p_stage_; //!< Staging resources
|
|
std::vector<Device*> enabled_p2p_devices_; //!< List of user enabled P2P devices for this device
|
|
|
|
std::once_flag heap_initialized_; //!< Heap buffer initialization flag
|
|
device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device
|
|
|
|
amd::Memory* arena_mem_obj_; //!< Arena memory object
|
|
uint64_t stack_size_{1024}; //!< Device stack size
|
|
device::Memory* initial_heap_buffer_; //!< Initial heap buffer
|
|
uint64_t initial_heap_size_{HIP_INITIAL_DM_SIZE}; //!< Initial device heap size
|
|
private:
|
|
const Isa *isa_; //!< Device isa
|
|
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
|
|
|
|
#if defined(WITH_HSA_DEVICE)
|
|
static AppProfile* rocAppProfile_;
|
|
#endif
|
|
|
|
static std::vector<Device*>* devices_; //!< All known devices
|
|
static amd::Monitor lockP2P_;
|
|
Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access
|
|
std::map<uintptr_t, device::Memory*>* vaCacheMap_; //!< VA cache map
|
|
uint32_t index_; //!< Unique device index
|
|
};
|
|
|
|
/*! @}
|
|
* @}
|
|
*/
|
|
|
|
} // namespace amd
|
|
|
|
#endif /*DEVICE_HPP_*/
|