e51a27e182
SWDEV-182054 - Allow building OpenCL-Runtime with COMGR enabled without OpenCL-Driver being present Affected files ... ... //depot/stg/opencl/drivers/opencl/api/hip/build/Makefile.hip#12 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#184 edit ... //depot/stg/opencl/drivers/opencl/compiler/Makefile#71 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#242 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#335 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#30 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.hpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#126 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#87 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#120 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#51 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#100 edit
1423 rindas
48 KiB
C++
1423 rindas
48 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#ifndef DEVICE_HPP_
|
|
#define DEVICE_HPP_
|
|
|
|
#include "top.hpp"
|
|
#include "thread/thread.hpp"
|
|
#include "thread/monitor.hpp"
|
|
#include "platform/context.hpp"
|
|
#include "platform/object.hpp"
|
|
#include "platform/memory.hpp"
|
|
#include "utils/util.hpp"
|
|
#include "amdocl/cl_kernel.h"
|
|
#include "elf/elf.hpp"
|
|
#include "appprofile.hpp"
|
|
#include "devprogram.hpp"
|
|
#include "devkernel.hpp"
|
|
#include "amdocl/cl_profile_amd.h"
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
|
#include "caching/cache.hpp"
|
|
#include "driver/AmdCompiler.h"
|
|
#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
|
#include "acl.h"
|
|
|
|
#include "hwdebug.hpp"
|
|
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <map>
|
|
#include <list>
|
|
#include <set>
|
|
#include <utility>
|
|
|
|
namespace amd {
|
|
class Command;
|
|
class CommandQueue;
|
|
class ReadMemoryCommand;
|
|
class WriteMemoryCommand;
|
|
class FillMemoryCommand;
|
|
class CopyMemoryCommand;
|
|
class CopyMemoryP2PCommand;
|
|
class MapMemoryCommand;
|
|
class UnmapMemoryCommand;
|
|
class MigrateMemObjectsCommand;
|
|
class NDRangeKernelCommand;
|
|
class NativeFnCommand;
|
|
class FlushCommand;
|
|
class FinishCommand;
|
|
class AcquireExtObjectsCommand;
|
|
class ReleaseExtObjectsCommand;
|
|
class PerfCounterCommand;
|
|
class ReleaseObjectCommand;
|
|
class StallQueueCommand;
|
|
class Marker;
|
|
class ThreadTraceCommand;
|
|
class ThreadTraceMemObjectsCommand;
|
|
class SignalCommand;
|
|
class MakeBuffersResidentCommand;
|
|
class SvmFreeMemoryCommand;
|
|
class SvmCopyMemoryCommand;
|
|
class SvmFillMemoryCommand;
|
|
class SvmMapMemoryCommand;
|
|
class SvmUnmapMemoryCommand;
|
|
class TransferBufferFileCommand;
|
|
class HwDebugManager;
|
|
class Device;
|
|
#ifndef USE_COMGR_LIBRARY
|
|
class CacheCompilation;
|
|
#endif
|
|
struct KernelParameterDescriptor;
|
|
struct Coord3D;
|
|
|
|
namespace option {
|
|
class Options;
|
|
} // option
|
|
|
|
}
|
|
|
|
enum OclExtensions {
|
|
ClKhrFp64 = 0,
|
|
ClAmdFp64,
|
|
ClKhrSelectFpRoundingMode,
|
|
ClKhrGlobalInt32BaseAtomics,
|
|
ClKhrGlobalInt32ExtendedAtomics,
|
|
ClKhrLocalInt32BaseAtomics,
|
|
ClKhrLocalInt32ExtendedAtomics,
|
|
ClKhrInt64BaseAtomics,
|
|
ClKhrInt64ExtendedAtomics,
|
|
ClKhr3DImageWrites,
|
|
ClKhrByteAddressableStore,
|
|
ClKhrFp16,
|
|
ClKhrGlSharing,
|
|
ClKhrGLDepthImages,
|
|
ClExtDeviceFission,
|
|
ClAmdDeviceAttributeQuery,
|
|
ClAmdVec3,
|
|
ClAmdPrintf,
|
|
ClAmdMediaOps,
|
|
ClAmdMediaOps2,
|
|
ClAmdPopcnt,
|
|
#if defined(_WIN32)
|
|
ClKhrD3d10Sharing,
|
|
ClKhrD3d11Sharing,
|
|
ClKhrD3d9Sharing,
|
|
#endif
|
|
ClKhrImage2dFromBuffer,
|
|
ClAmdSemaphore,
|
|
ClAMDBusAddressableMemory,
|
|
ClAMDC11Atomics,
|
|
ClKhrSpir,
|
|
ClKhrSubGroups,
|
|
ClKhrGlEvent,
|
|
ClKhrDepthImages,
|
|
ClKhrMipMapImage,
|
|
ClKhrMipMapImageWrites,
|
|
ClKhrIlProgram,
|
|
ClAMDLiquidFlash,
|
|
ClAmdCopyBufferP2P,
|
|
ClAmdAssemblyProgram,
|
|
#if defined(_WIN32)
|
|
ClAmdPlanarYuv,
|
|
#endif
|
|
ClExtTotal
|
|
};
|
|
|
|
static const char* OclExtensionsString[] = {"cl_khr_fp64 ",
|
|
"cl_amd_fp64 ",
|
|
"cl_khr_select_fprounding_mode ",
|
|
"cl_khr_global_int32_base_atomics ",
|
|
"cl_khr_global_int32_extended_atomics ",
|
|
"cl_khr_local_int32_base_atomics ",
|
|
"cl_khr_local_int32_extended_atomics ",
|
|
"cl_khr_int64_base_atomics ",
|
|
"cl_khr_int64_extended_atomics ",
|
|
"cl_khr_3d_image_writes ",
|
|
"cl_khr_byte_addressable_store ",
|
|
"cl_khr_fp16 ",
|
|
"cl_khr_gl_sharing ",
|
|
"cl_khr_gl_depth_images ",
|
|
"cl_ext_device_fission ",
|
|
"cl_amd_device_attribute_query ",
|
|
"cl_amd_vec3 ",
|
|
"cl_amd_printf ",
|
|
"cl_amd_media_ops ",
|
|
"cl_amd_media_ops2 ",
|
|
"cl_amd_popcnt ",
|
|
#if defined(_WIN32)
|
|
"cl_khr_d3d10_sharing ",
|
|
"cl_khr_d3d11_sharing ",
|
|
"cl_khr_dx9_media_sharing ",
|
|
#endif
|
|
"cl_khr_image2d_from_buffer ",
|
|
"",
|
|
"cl_amd_bus_addressable_memory ",
|
|
"cl_amd_c11_atomics ",
|
|
"cl_khr_spir ",
|
|
"cl_khr_subgroups ",
|
|
"cl_khr_gl_event ",
|
|
"cl_khr_depth_images ",
|
|
"cl_khr_mipmap_image ",
|
|
"cl_khr_mipmap_image_writes ",
|
|
"",
|
|
"cl_amd_liquid_flash ",
|
|
"cl_amd_copy_buffer_p2p ",
|
|
"cl_amd_assembly_program ",
|
|
#if defined(_WIN32)
|
|
"cl_amd_planar_yuv",
|
|
#endif
|
|
NULL};
|
|
|
|
static constexpr int AmdVendor = 0x1002;
|
|
|
|
namespace device {
|
|
class ClBinary;
|
|
class BlitManager;
|
|
class Program;
|
|
class Kernel;
|
|
|
|
//! Physical device properties.
|
|
struct Info : public amd::EmbeddedObject {
|
|
//! The OpenCL device type.
|
|
cl_device_type type_;
|
|
|
|
//! A unique device vendor identifier.
|
|
cl_uint vendorId_;
|
|
|
|
//! The number of parallel compute cores on the compute device.
|
|
cl_uint maxComputeUnits_;
|
|
|
|
//! Maximum dimensions that specify the global and local work-item IDs
|
|
// used by the data-parallel execution model.
|
|
cl_uint maxWorkItemDimensions_;
|
|
|
|
//! Maximum number of work-items that can be specified in each dimension
|
|
// to clEnqueueNDRangeKernel.
|
|
size_t maxWorkItemSizes_[3];
|
|
|
|
//! Maximum number of work-items in a work-group executing a kernel
|
|
// using the data-parallel execution model.
|
|
size_t maxWorkGroupSize_;
|
|
|
|
//! Preferred number of work-items in a work-group executing a kernel
|
|
// using the data-parallel execution model.
|
|
size_t preferredWorkGroupSize_;
|
|
|
|
//! Number of shader engines in physical GPU
|
|
size_t numberOfShaderEngines;
|
|
|
|
//! cl_uint Preferred native vector width size for built-in scalar types
|
|
// that can be put into vectors.
|
|
cl_uint preferredVectorWidthChar_;
|
|
cl_uint preferredVectorWidthShort_;
|
|
cl_uint preferredVectorWidthInt_;
|
|
cl_uint preferredVectorWidthLong_;
|
|
cl_uint preferredVectorWidthFloat_;
|
|
cl_uint preferredVectorWidthDouble_;
|
|
cl_uint preferredVectorWidthHalf_;
|
|
|
|
//! Returns the native ISA vector width. The vector width is defined as the
|
|
// number of scalar elements that can be stored in the vector.
|
|
cl_uint nativeVectorWidthChar_;
|
|
cl_uint nativeVectorWidthShort_;
|
|
cl_uint nativeVectorWidthInt_;
|
|
cl_uint nativeVectorWidthLong_;
|
|
cl_uint nativeVectorWidthFloat_;
|
|
cl_uint nativeVectorWidthDouble_;
|
|
cl_uint nativeVectorWidthHalf_;
|
|
|
|
//! Maximum configured engine clock frequency of the device in MHz.
|
|
cl_uint maxEngineClockFrequency_;
|
|
|
|
//! Maximum configured memory clock frequency of the device in MHz.
|
|
cl_uint maxMemoryClockFrequency_;
|
|
|
|
//! Memory bus width in bits.
|
|
cl_uint vramBusBitWidth_;
|
|
|
|
//! Size of L2 Cache in bytes.
|
|
cl_uint l2CacheSize_;
|
|
|
|
//! Timestamp frequency in Hz.
|
|
cl_uint timeStampFrequency_;
|
|
|
|
//! Describes the address spaces supported by the device.
|
|
cl_uint addressBits_;
|
|
|
|
//! Max number of simultaneous image objects that can be read by a
|
|
// kernel.
|
|
cl_uint maxReadImageArgs_;
|
|
|
|
//! Max number of simultaneous image objects that can be written to
|
|
// by a kernel.
|
|
cl_uint maxWriteImageArgs_;
|
|
|
|
//! Max number of simultaneous image objects that can be read/written to
|
|
// by a kernel.
|
|
cl_uint maxReadWriteImageArgs_;
|
|
|
|
//! Max size of memory object allocation in bytes.
|
|
cl_ulong maxMemAllocSize_;
|
|
|
|
//! Max width of 2D image in pixels.
|
|
size_t image2DMaxWidth_;
|
|
|
|
//! Max height of 2D image in pixels.
|
|
size_t image2DMaxHeight_;
|
|
|
|
//! Max width of 3D image in pixels.
|
|
size_t image3DMaxWidth_;
|
|
|
|
//! Max height of 3D image in pixels.
|
|
size_t image3DMaxHeight_;
|
|
|
|
//! Max depth of 3D image in pixels.
|
|
size_t image3DMaxDepth_;
|
|
|
|
//! Describes whether images are supported
|
|
cl_bool imageSupport_;
|
|
|
|
//! Max size in bytes of the arguments that can be passed to a kernel.
|
|
size_t maxParameterSize_;
|
|
|
|
//! Maximum number of samplers that can be used in a kernel.
|
|
cl_uint maxSamplers_;
|
|
|
|
//! Describes the alignment in bits of the base address of any
|
|
// allocated memory object.
|
|
cl_uint memBaseAddrAlign_;
|
|
|
|
//! The smallest alignment in bytes which can be used for any data type.
|
|
cl_uint minDataTypeAlignSize_;
|
|
|
|
//! Describes single precision floating point capability of the device.
|
|
cl_device_fp_config halfFPConfig_;
|
|
cl_device_fp_config singleFPConfig_;
|
|
cl_device_fp_config doubleFPConfig_;
|
|
|
|
//! Type of global memory cache supported.
|
|
cl_device_mem_cache_type globalMemCacheType_;
|
|
|
|
//! Size of global memory cache line in bytes.
|
|
cl_uint globalMemCacheLineSize_;
|
|
|
|
//! Size of global memory cache in bytes.
|
|
cl_ulong globalMemCacheSize_;
|
|
|
|
//! Size of global device memory in bytes.
|
|
cl_ulong globalMemSize_;
|
|
|
|
//! Max size in bytes of a constant buffer allocation.
|
|
cl_ulong maxConstantBufferSize_;
|
|
|
|
//! Preferred size in bytes of a constant buffer allocation.
|
|
cl_ulong preferredConstantBufferSize_;
|
|
|
|
//! Max number of arguments declared
|
|
cl_uint maxConstantArgs_;
|
|
|
|
//! This is used to determine the type of local memory that is available
|
|
cl_device_local_mem_type localMemType_;
|
|
|
|
//! Size of local memory arena in bytes.
|
|
cl_ulong localMemSize_;
|
|
|
|
//! If enabled, implies that all the memories, caches, registers etc. in
|
|
// the device implement error correction.
|
|
cl_bool errorCorrectionSupport_;
|
|
|
|
//! CL_TRUE if the device and the host have a unified memory subsystem and
|
|
// is CL_FALSE otherwise.
|
|
cl_bool hostUnifiedMemory_;
|
|
|
|
//! Describes the resolution of device timer.
|
|
size_t profilingTimerResolution_;
|
|
|
|
//! Timer starting point offset to Epoch.
|
|
cl_ulong profilingTimerOffset_;
|
|
|
|
//! CL_TRUE if device is a little endian device.
|
|
cl_bool littleEndian_;
|
|
|
|
//! If enabled, implies that commands can be submitted to command-queues
|
|
// created on this device.
|
|
cl_bool available_;
|
|
|
|
//! if the implementation does not have a compiler available to compile
|
|
// the program source.
|
|
cl_bool compilerAvailable_;
|
|
|
|
//! Describes the execution capabilities of the device.
|
|
cl_device_exec_capabilities executionCapabilities_;
|
|
|
|
//! Describes the SVM capabilities of the device.
|
|
cl_device_svm_capabilities svmCapabilities_;
|
|
|
|
//! Preferred alignment for OpenCL fine-grained SVM atomic types.
|
|
cl_uint preferredPlatformAtomicAlignment_;
|
|
|
|
//! Preferred alignment for OpenCL global atomic types.
|
|
cl_uint preferredGlobalAtomicAlignment_;
|
|
|
|
//! Preferred alignment for OpenCL local atomic types.
|
|
cl_uint preferredLocalAtomicAlignment_;
|
|
|
|
//! Describes the command-queue properties supported of the host queue.
|
|
cl_command_queue_properties queueProperties_;
|
|
|
|
//! The platform associated with this device
|
|
cl_platform_id platform_;
|
|
|
|
//! Device name string
|
|
char name_[0x40];
|
|
|
|
//! Vendor name string
|
|
char vendor_[0x20];
|
|
|
|
//! OpenCL software driver version string in the form major.minor
|
|
char driverVersion_[0x20];
|
|
|
|
//! Returns the profile name supported by the device.
|
|
const char* profile_;
|
|
|
|
//! Returns the OpenCL version supported by the device.
|
|
const char* version_;
|
|
|
|
//! The highest OpenCL C version supported by the compiler for this device.
|
|
const char* oclcVersion_;
|
|
|
|
//! Returns a space separated list of extension names.
|
|
const char* extensions_;
|
|
|
|
//! Returns if device linker is available
|
|
cl_bool linkerAvailable_;
|
|
|
|
//! Returns the list of built-in kernels, supported by the device
|
|
const char* builtInKernels_;
|
|
|
|
//! Returns max number of pixels for a 1D image created from a buffer object
|
|
size_t imageMaxBufferSize_;
|
|
|
|
//! Returns max number of images in a 1D or 2D image array
|
|
size_t imageMaxArraySize_;
|
|
|
|
//! Returns CL_TRUE if the devices preference is for the user to be
|
|
//! responsible for synchronization
|
|
cl_bool preferredInteropUserSync_;
|
|
|
|
//! Returns maximum size of the internal buffer that holds the output
|
|
//! of printf calls from a kernel
|
|
size_t printfBufferSize_;
|
|
|
|
//! Indicates maximum number of supported global atomic counters
|
|
cl_uint maxAtomicCounters_;
|
|
|
|
//! Returns the topology for the device
|
|
cl_device_topology_amd deviceTopology_;
|
|
|
|
//! Semaphore information
|
|
cl_uint maxSemaphores_;
|
|
cl_uint maxSemaphoreSize_;
|
|
|
|
//! Returns the SKU board name for the device
|
|
char boardName_[128];
|
|
|
|
//! Number of SIMD (Single Instruction Multiple Data) units per compute unit
|
|
//! that execute in parallel. All work items from the same work group must be
|
|
//! executed by SIMDs in the same compute unit.
|
|
cl_uint simdPerCU_;
|
|
cl_uint cuPerShaderArray_; //!< Number of CUs per shader array
|
|
//! The maximum number of work items from the same work group that can be
|
|
//! executed by a SIMD in parallel
|
|
cl_uint simdWidth_;
|
|
//! The number of instructions that a SIMD can execute in parallel
|
|
cl_uint simdInstructionWidth_;
|
|
//! The number of workitems per wavefront
|
|
cl_uint wavefrontWidth_;
|
|
//! Available number of SGPRs
|
|
cl_uint availableSGPRs_;
|
|
//! Number of global memory channels
|
|
cl_uint globalMemChannels_;
|
|
//! Number of banks in each global memory channel
|
|
cl_uint globalMemChannelBanks_;
|
|
//! Width in bytes of each of global memory bank
|
|
cl_uint globalMemChannelBankWidth_;
|
|
//! Local memory size per CU
|
|
cl_uint localMemSizePerCU_;
|
|
//! Number of banks of local memory
|
|
cl_uint localMemBanks_;
|
|
//! The core engine GFXIP version
|
|
cl_uint gfxipVersion_;
|
|
//! Number of available async queues
|
|
cl_uint numAsyncQueues_;
|
|
//! Number of available real time queues
|
|
cl_uint numRTQueues_;
|
|
//! Number of available real time compute units
|
|
cl_uint numRTCUs_;
|
|
//! Thread trace enable
|
|
cl_bool threadTraceEnable_;
|
|
//! ECC protected GPRs support (only available Vega20+)
|
|
cl_bool sramEccEnabled_;
|
|
|
|
//! Image pitch alignment for image2d_from_buffer
|
|
cl_uint imagePitchAlignment_;
|
|
//! Image base address alignment for image2d_from_buffer
|
|
cl_uint imageBaseAddressAlignment_;
|
|
|
|
//! Describes whether buffers from images are supported
|
|
cl_bool bufferFromImageSupport_;
|
|
|
|
//! Returns the supported SPIR versions for the device
|
|
const char* spirVersions_;
|
|
|
|
//! OpenCL20 device info fields:
|
|
|
|
//! The max number of pipe objects that can be passed as arguments to a kernel
|
|
cl_uint maxPipeArgs_;
|
|
//! The max number of reservations that can be active for a pipe per work-item in a kernel
|
|
cl_uint maxPipeActiveReservations_;
|
|
//! The max size of pipe packet in bytes
|
|
cl_uint maxPipePacketSize_;
|
|
|
|
//! The command-queue properties supported of the device queue.
|
|
cl_command_queue_properties queueOnDeviceProperties_;
|
|
//! The preferred size of the device queue in bytes
|
|
cl_uint queueOnDevicePreferredSize_;
|
|
//! The max size of the device queue in bytes
|
|
cl_uint queueOnDeviceMaxSize_;
|
|
//! The maximum number of device queues
|
|
cl_uint maxOnDeviceQueues_;
|
|
//! The maximum number of events in use on a device queue
|
|
cl_uint maxOnDeviceEvents_;
|
|
|
|
//! The maximum size of global scope variables
|
|
size_t maxGlobalVariableSize_;
|
|
size_t globalVariablePreferredTotalSize_;
|
|
//! Driver store location
|
|
char driverStore_[200];
|
|
//! Device ID
|
|
uint32_t pcieDeviceId_;
|
|
//! Revision ID
|
|
uint32_t pcieRevisionId_;
|
|
|
|
//! Max numbers of threads per CU
|
|
cl_uint maxThreadsPerCU_;
|
|
};
|
|
|
|
//! Device settings
|
|
class Settings : public amd::HeapObject {
|
|
public:
|
|
uint64_t extensions_; //!< Supported OCL extensions
|
|
union {
|
|
struct {
|
|
uint overrideLclSet : 3; //!< Bit mask to override the local size
|
|
uint apuSystem_ : 1; //!< Device is APU system with shared memory
|
|
uint supportRA_ : 1; //!< Support RA channel order format
|
|
uint waitCommand_ : 1; //!< Enables a wait for every submitted command
|
|
uint customHostAllocator_ : 1; //!< True if device has custom host allocator
|
|
// that replaces generic OS allocation routines
|
|
uint supportDepthsRGB_ : 1; //!< Support DEPTH and sRGB channel order format
|
|
uint enableHwDebug_ : 1; //!< Enable HW debug support
|
|
uint reportFMAF_ : 1; //!< Report FP_FAST_FMAF define in CL program
|
|
uint reportFMA_ : 1; //!< Report FP_FAST_FMA define in CL program
|
|
uint singleFpDenorm_ : 1; //!< Support Single FP Denorm
|
|
uint hsailExplicitXnack_ : 1; //!< Xnack in hsail path for this deivce
|
|
uint useLightning_ : 1; //!< Enable LC path for this device
|
|
uint enableWgpMode_ : 1; //!< Enable WGP mode for this device
|
|
uint enableWave32Mode_ : 1; //!< Enable Wave32 mode for this device
|
|
uint lcWavefrontSize64_ : 1; //!< Enable Wave64 mode for this device
|
|
uint enableXNACK_ : 1; //!< Enable XNACK feature
|
|
uint reserved_ : 14;
|
|
};
|
|
uint value_;
|
|
};
|
|
|
|
uint commandQueues_; //!< Field value for maximum number
|
|
//!< concurrent Virtual GPUs for each backend
|
|
|
|
//! Default constructor
|
|
Settings();
|
|
|
|
//! Check the specified extension
|
|
bool checkExtension(uint name) const {
|
|
return (extensions_ & (static_cast<uint64_t>(1) << name)) ? true : false;
|
|
}
|
|
|
|
//! Enable the specified extension
|
|
void enableExtension(uint name) { extensions_ |= static_cast<uint64_t>(1) << name; }
|
|
|
|
private:
|
|
//! Disable copy constructor
|
|
Settings(const Settings&);
|
|
|
|
//! Disable assignment
|
|
Settings& operator=(const Settings&);
|
|
};
|
|
|
|
//! Device-independent cache memory, base class for the device-specific
|
|
//! memories. One Memory instance refers to one or more of these.
|
|
class Memory : public amd::HeapObject {
|
|
public:
|
|
//! Resource map flags
|
|
enum CpuMapFlags {
|
|
CpuReadWrite = 0x00000000, //!< Lock for CPU read/Write
|
|
CpuReadOnly = 0x00000001, //!< Lock for CPU read only operation
|
|
CpuWriteOnly = 0x00000002, //!< Lock for CPU write only operation
|
|
};
|
|
|
|
union SyncFlags {
|
|
struct {
|
|
uint skipParent_ : 1; //!< Skip parent synchronization
|
|
uint skipViews_ : 1; //!< Skip views synchronization
|
|
uint skipEntire_ : 1; //!< Skip entire synchronization
|
|
};
|
|
uint value_;
|
|
SyncFlags() : value_(0) {}
|
|
};
|
|
|
|
struct WriteMapInfo : public amd::HeapObject {
|
|
amd::Coord3D origin_; //!< Origin of the map location
|
|
amd::Coord3D region_; //!< Mapped region
|
|
amd::Image* baseMip_; //!< The base mip level for images
|
|
union {
|
|
struct {
|
|
uint32_t count_ : 8; //!< The same map region counter
|
|
uint32_t unmapWrite_ : 1; //!< Unmap write operation
|
|
uint32_t unmapRead_ : 1; //!< Unmap read operation
|
|
uint32_t entire_ : 1; //!< Process the entire memory
|
|
};
|
|
uint32_t flags_;
|
|
};
|
|
|
|
//! Returns the state of entire map
|
|
bool isEntire() const { return (entire_) ? true : false; }
|
|
|
|
//! Returns the state of map write flag
|
|
bool isUnmapWrite() const { return (unmapWrite_) ? true : false; }
|
|
|
|
//! Returns the state of map read flag
|
|
bool isUnmapRead() const { return (unmapRead_) ? true : false; }
|
|
|
|
WriteMapInfo() : origin_(0, 0, 0), region_(0, 0, 0), baseMip_(NULL), flags_(0) {}
|
|
};
|
|
|
|
//! Constructor (from an amd::Memory object).
|
|
Memory(amd::Memory& owner)
|
|
: flags_(0), owner_(&owner), version_(0), mapMemory_(NULL), indirectMapCount_(0) {
|
|
size_ = owner.getSize();
|
|
}
|
|
|
|
//! Constructor (no owner), always eager allocation.
|
|
Memory(size_t size)
|
|
: flags_(0), owner_(NULL), version_(0), mapMemory_(NULL), indirectMapCount_(0), size_(size) {}
|
|
|
|
enum GLResourceOP {
|
|
GLDecompressResource = 0, // orders the GL driver to decompress any depth-stencil or MSAA
|
|
// resource to be sampled by a CL kernel.
|
|
GLInvalidateFBO // orders the GL driver to invalidate any FBO the resource may be bound to,
|
|
// since the resource internal state changed.
|
|
};
|
|
|
|
//! Default destructor for the device memory object
|
|
virtual ~Memory(){};
|
|
|
|
//! Releases virtual objects associated with this memory
|
|
void releaseVirtual();
|
|
|
|
//! Read the size
|
|
size_t size() const { return size_; }
|
|
|
|
//! Gets the owner Memory instance
|
|
amd::Memory* owner() const { return owner_; }
|
|
|
|
//! Immediate blocking write from device cache to owners's backing store.
|
|
//! Marks owner as "current" by resetting the last writer to NULL.
|
|
virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()) {}
|
|
|
|
//! Allocate memory for API-level maps
|
|
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
|
|
const amd::Coord3D& region, //!< The map region in memory
|
|
uint mapFlags, //!< Map flags
|
|
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
|
|
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
|
) {
|
|
return NULL;
|
|
}
|
|
|
|
virtual bool pinSystemMemory(void* hostPtr, //!< System memory address
|
|
size_t size //!< Size of allocated system memory
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
//! Releases indirect map surface
|
|
virtual void releaseIndirectMap() {}
|
|
//! decompress any MSAA/depth-stencil interop surfaces.
|
|
//! notify GL to invalidate any surfaces touched by a CL kernel
|
|
virtual bool processGLResource(GLResourceOP operation) { return false; }
|
|
|
|
//! Map the device memory to CPU visible
|
|
virtual void* cpuMap(VirtualDevice& vDev, //!< Virtual device for map operaiton
|
|
uint flags = 0, //!< flags for the map operation
|
|
// Optimization for multilayer map/unmap
|
|
uint startLayer = 0, //!< Start layer for multilayer map
|
|
uint numLayers = 0, //!< End layer for multilayer map
|
|
size_t* rowPitch = NULL, //!< Row pitch for the device memory
|
|
size_t* slicePitch = NULL //!< Slice pitch for the device memory
|
|
) {
|
|
amd::Image* image = owner()->asImage();
|
|
if (image != NULL) {
|
|
*rowPitch = image->getRowPitch();
|
|
*slicePitch = image->getSlicePitch();
|
|
}
|
|
// Default behavior uses preallocated host mem for CPU
|
|
return owner()->getHostMem();
|
|
}
|
|
|
|
//! Unmap the device memory
|
|
virtual void cpuUnmap(VirtualDevice& vDev //!< Virtual device for unmap operaiton
|
|
) {}
|
|
|
|
//! Saves map info for this object
|
|
//! @note: It's not a thread safe operation, the app must implement
|
|
//! synchronization for the multiple write maps if necessary
|
|
void saveMapInfo(const void* mapAddress, //!< Map cpu address
|
|
const amd::Coord3D origin, //!< Origin of the map location
|
|
const amd::Coord3D region, //!< Mapped region
|
|
uint mapFlags, //!< Map flags
|
|
bool entire, //!< True if the enitre memory was mapped
|
|
amd::Image* baseMip = nullptr //!< The base mip level for map
|
|
);
|
|
|
|
const WriteMapInfo* writeMapInfo(const void* mapAddress) const {
|
|
// Unmap must be serialized.
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
|
|
auto it = writeMapInfo_.find(mapAddress);
|
|
if (it == writeMapInfo_.end()) {
|
|
if (writeMapInfo_.size() == 0) {
|
|
LogError("Unmap is a NOP!");
|
|
return nullptr;
|
|
}
|
|
LogWarning("Unknown unmap signature!");
|
|
// Get the first map info
|
|
it = writeMapInfo_.begin();
|
|
}
|
|
return &it->second;
|
|
}
|
|
|
|
//! Clear memory object as mapped read only
|
|
void clearUnmapInfo(const void* mapAddress) {
|
|
// Unmap must be serialized.
|
|
amd::ScopedLock lock(owner()->lockMemoryOps());
|
|
auto it = writeMapInfo_.find(mapAddress);
|
|
if (it == writeMapInfo_.end()) {
|
|
// Get the first map info
|
|
it = writeMapInfo_.begin();
|
|
}
|
|
if (--it->second.count_ == 0) {
|
|
writeMapInfo_.erase(it);
|
|
}
|
|
}
|
|
|
|
//! Returns the state of memory direct access flag
|
|
bool isHostMemDirectAccess() const { return (flags_ & HostMemoryDirectAccess) ? true : false; }
|
|
|
|
//! Returns the state of host memory registration flag
|
|
bool isHostMemoryRegistered() const { return (flags_ & HostMemoryRegistered) ? true : false; }
|
|
|
|
//! Returns the state of CPU uncached access
|
|
bool isCpuUncached() const { return (flags_ & MemoryCpuUncached) ? true : false; }
|
|
|
|
virtual uint64_t virtualAddress() const { return 0; }
|
|
|
|
//! Returns CPU pointer to HW state
|
|
virtual const address cpuSrd() const { return nullptr; }
|
|
|
|
virtual void IpcCreate(size_t offset, size_t* mem_size, void* handle) const {
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
protected:
|
|
enum Flags {
|
|
HostMemoryDirectAccess = 0x00000001, //!< GPU has direct access to the host memory
|
|
MapResourceAlloced = 0x00000002, //!< Map resource was allocated
|
|
PinnedMemoryAlloced = 0x00000004, //!< An extra pinned resource was allocated
|
|
SubMemoryObject = 0x00000008, //!< Memory is sub-memory
|
|
HostMemoryRegistered = 0x00000010, //!< Host memory was registered
|
|
MemoryCpuUncached = 0x00000020 //!< Memory is uncached on CPU access(slow read)
|
|
};
|
|
uint flags_; //!< Memory object flags
|
|
|
|
amd::Memory* owner_; //!< The Memory instance that we cache,
|
|
//!< or NULL if we're device-private workspace.
|
|
|
|
volatile size_t version_; //!< The version we're currently shadowing
|
|
|
|
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
|
|
//! not a physical map. When a memory object does not use USE_HOST_PTR we
|
|
//! can use a remote resource and DMA, avoiding the additional CPU memcpy.
|
|
amd::Memory* mapMemory_; //!< Memory used as map target buffer
|
|
volatile size_t indirectMapCount_; //!< Number of maps
|
|
std::unordered_map<const void*, WriteMapInfo> writeMapInfo_; //!< Saved write map info for partial unmap
|
|
|
|
//! Increment map count
|
|
void incIndMapCount() { ++indirectMapCount_; }
|
|
|
|
//! Decrement map count
|
|
virtual void decIndMapCount() {}
|
|
|
|
size_t size_; //!< Memory size
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
Memory& operator=(const Memory&) = delete;
|
|
|
|
//! Disable operator=
|
|
Memory(const Memory&) = delete;
|
|
};
|
|
|
|
class Sampler : public amd::HeapObject {
|
|
public:
|
|
//! Constructor
|
|
Sampler() : hwSrd_(0), hwState_(nullptr) {}
|
|
|
|
//! Default destructor for the device memory object
|
|
virtual ~Sampler(){};
|
|
|
|
//! Returns device specific HW state for the sampler
|
|
uint64_t hwSrd() const { return hwSrd_; }
|
|
|
|
//! Returns CPU pointer to HW state
|
|
const address hwState() const { return hwState_; }
|
|
|
|
protected:
|
|
uint64_t hwSrd_; //!< Device specific HW state for the sampler
|
|
address hwState_; //!< CPU pointer to HW state
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
Sampler& operator=(const Sampler&);
|
|
|
|
//! Disable operator=
|
|
Sampler(const Sampler&);
|
|
};
|
|
|
|
class ClBinary : public amd::HeapObject {
|
|
public:
|
|
enum BinaryImageFormat {
|
|
BIF_VERSION2 = 0, //!< Binary Image Format version 2.0 (ELF)
|
|
BIF_VERSION3 //!< Binary Image Format version 3.0 (ELF)
|
|
};
|
|
|
|
//! Constructor
|
|
ClBinary(const amd::Device& dev, BinaryImageFormat bifVer = BIF_VERSION3);
|
|
|
|
//! Destructor
|
|
virtual ~ClBinary();
|
|
|
|
void init(amd::option::Options* optionsObj, bool amdilRequired = false);
|
|
|
|
/** called only in loading image routines,
|
|
never called in storing routines */
|
|
bool setBinary(const char* theBinary, size_t theBinarySize, bool allocated = false);
|
|
|
|
//! setin elfIn_
|
|
bool setElfIn();
|
|
void resetElfIn();
|
|
|
|
//! set out elf
|
|
bool setElfOut(unsigned char eclass, const char* outFile);
|
|
void resetElfOut();
|
|
|
|
//! Set elf header information
|
|
virtual bool setElfTarget();
|
|
|
|
// class used in for loading images in new format
|
|
amd::OclElf* elfIn() { return elfIn_; }
|
|
|
|
// classes used storing and loading images in new format
|
|
amd::OclElf* elfOut() { return elfOut_; }
|
|
void elfOut(amd::OclElf* v) { elfOut_ = v; }
|
|
|
|
//! Create and save ELF binary image
|
|
bool createElfBinary(bool doencrypt, Program::type_t type);
|
|
|
|
// save BIF binary image
|
|
void saveBIFBinary(const char* binaryIn, size_t size);
|
|
|
|
bool decryptElf(const char* binaryIn, size_t size, char** decryptBin, size_t* decryptSize,
|
|
int* encryptCode);
|
|
|
|
//! Returns the binary pair for the abstraction layer
|
|
Program::binary_t data() const;
|
|
|
|
//! Loads llvmir binary from OCL binary file
|
|
bool loadLlvmBinary(
|
|
std::string& llvmBinary, //!< LLVMIR binary code
|
|
amd::OclElf::oclElfSections& elfSectionType //!< LLVMIR binary is in SPIR format
|
|
) const;
|
|
|
|
//! Loads compile options from OCL binary file
|
|
bool loadCompileOptions(std::string& compileOptions //!< return the compile options loaded
|
|
) const;
|
|
|
|
//! Loads link options from OCL binary file
|
|
bool loadLinkOptions(std::string& linkOptions //!< return the link options loaded
|
|
) const;
|
|
|
|
//! Store compile options into OCL binary file
|
|
void storeCompileOptions(const std::string& compileOptions //!< the compile options to be stored
|
|
);
|
|
|
|
//! Store link options into OCL binary file
|
|
void storeLinkOptions(const std::string& linkOptions //!< the link options to be stored
|
|
);
|
|
|
|
//! Check if the binary is recompilable
|
|
bool isRecompilable(std::string& llvmBinary, amd::OclElf::oclElfPlatform thePlatform);
|
|
|
|
void saveOrigBinary(const char* origBinary, size_t origSize) {
|
|
origBinary_ = origBinary;
|
|
origSize_ = origSize;
|
|
}
|
|
|
|
void restoreOrigBinary() {
|
|
if (origBinary_ != NULL) {
|
|
(void)setBinary(origBinary_, origSize_, false);
|
|
}
|
|
}
|
|
|
|
//! Set Binary flags
|
|
void setFlags(int encryptCode);
|
|
|
|
bool saveSOURCE() { return ((flags_ & BinarySourceMask) == BinarySaveSource); }
|
|
bool saveLLVMIR() { return ((flags_ & BinaryLlvmirMask) == BinarySaveLlvmir); }
|
|
bool saveAMDIL() { return ((flags_ & BinaryAmdilMask) == BinarySaveAmdil); }
|
|
bool saveISA() { return ((flags_ & BinaryIsaMask) == BinarySaveIsa); }
|
|
|
|
bool saveAS() { return ((flags_ & BinaryASMask) == BinarySaveAS); }
|
|
|
|
// Return the encrypt code for this input binary ( "> 0" means encrypted)
|
|
int getEncryptCode() { return encryptCode_; }
|
|
|
|
// Returns TRUE of binary file is SPIR
|
|
bool isSPIR() const;
|
|
// Returns TRUE of binary file is SPIRV
|
|
bool isSPIRV() const;
|
|
|
|
protected:
|
|
enum Flags {
|
|
BinaryAllocated = 0x1, //!< Binary was created
|
|
|
|
// Source control
|
|
BinaryNoSaveSource = 0x0, // 0: default
|
|
BinaryRemoveSource = 0x2, // for encrypted binary
|
|
BinarySaveSource = 0x4,
|
|
BinarySourceMask = 0x6,
|
|
|
|
// LLVMIR control
|
|
BinarySaveLlvmir = 0x0, // 0: default
|
|
BinaryRemoveLlvmir = 0x8, // for encrypted binary
|
|
BinaryNoSaveLlvmir = 0x10,
|
|
BinaryLlvmirMask = 0x18,
|
|
|
|
// AMDIL control
|
|
BinarySaveAmdil = 0x0, // 0: default
|
|
BinaryRemoveAmdil = 0x20, // for encrypted binary
|
|
BinaryNoSaveAmdil = 0x40,
|
|
BinaryAmdilMask = 0x60,
|
|
|
|
// ISA control
|
|
BinarySaveIsa = 0x0, // 0: default
|
|
BinaryRemoveIsa = 0x80, // for encrypted binary
|
|
BinaryNoSaveIsa = 0x100,
|
|
BinaryIsaMask = 0x180,
|
|
|
|
// AS control
|
|
BinaryNoSaveAS = 0x0, // 0: default
|
|
BinaryRemoveAS = 0x200, // for encrypted binary
|
|
BinarySaveAS = 0x400,
|
|
BinaryASMask = 0x600
|
|
};
|
|
|
|
//! Returns TRUE if binary file was allocated
|
|
bool isBinaryAllocated() const { return (flags_ & BinaryAllocated) ? true : false; }
|
|
|
|
//! Returns BIF symbol name by symbolID,
|
|
//! returns empty string if not found or if BIF version is unsupported
|
|
std::string getBIFSymbol(unsigned int symbolID) const;
|
|
|
|
protected:
|
|
const amd::Device& dev_; //!< Device object
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
ClBinary(const ClBinary&);
|
|
|
|
//! Disable default operator=
|
|
ClBinary& operator=(const ClBinary&);
|
|
|
|
//! Releases the binary data store
|
|
void release();
|
|
|
|
const char* binary_; //!< binary data
|
|
size_t size_; //!< binary size
|
|
uint flags_; //!< CL binary object flags
|
|
|
|
const char* origBinary_; //!< original binary data
|
|
size_t origSize_; //!< original binary size
|
|
|
|
int encryptCode_; //!< Encryption Code for input binary (0 for not encrypted)
|
|
|
|
protected:
|
|
amd::OclElf* elfIn_; //!< ELF object for input ELF binary
|
|
amd::OclElf* elfOut_; //!< ELF object for output ELF binary
|
|
BinaryImageFormat format_; //!< which binary image format to use
|
|
};
|
|
|
|
inline const Program::binary_t Program::binary() const {
|
|
if (clBinary() == NULL) {
|
|
return {(const void*)0, 0};
|
|
}
|
|
return clBinary()->data();
|
|
}
|
|
|
|
inline Program::binary_t Program::binary() {
|
|
if (clBinary() == NULL) {
|
|
return {(const void*)0, 0};
|
|
}
|
|
return clBinary()->data();
|
|
}
|
|
|
|
/*! \class PerfCounter
|
|
*
|
|
* \brief The device interface class for the performance counters
|
|
*/
|
|
class PerfCounter : public amd::HeapObject {
|
|
public:
|
|
//! Constructor for the device performance
|
|
PerfCounter() {}
|
|
|
|
//! Get the performance counter info
|
|
virtual uint64_t getInfo(uint64_t infoType) const = 0;
|
|
|
|
//! Destructor for PerfCounter class
|
|
virtual ~PerfCounter() {}
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
PerfCounter(const PerfCounter&);
|
|
|
|
//! Disable default operator=
|
|
PerfCounter& operator=(const PerfCounter&);
|
|
};
|
|
/*! \class ThreadTrace
|
|
*
|
|
* \brief The device interface class for the performance counters
|
|
*/
|
|
class ThreadTrace : public amd::HeapObject {
|
|
public:
|
|
//! Constructor for the device performance
|
|
ThreadTrace() {}
|
|
//! Update ThreadTrace status to true/false if new buffer was binded/unbinded respectively
|
|
virtual void setNewBufferBinded(bool) = 0;
|
|
//! Get the performance counter info
|
|
virtual bool info(uint infoType, uint* info, uint infoSize) const = 0;
|
|
//! Destructor for PerfCounter class
|
|
virtual ~ThreadTrace() {}
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
ThreadTrace(const ThreadTrace&);
|
|
|
|
//! Disable default operator=
|
|
ThreadTrace& operator=(const ThreadTrace&);
|
|
};
|
|
|
|
//! A device execution environment.
|
|
class VirtualDevice : public amd::HeapObject {
|
|
public:
|
|
//! Construct a new virtual device for the given physical device.
|
|
VirtualDevice(amd::Device& device) : device_(device), blitMgr_(NULL) {}
|
|
|
|
//! Destroy this virtual device.
|
|
virtual ~VirtualDevice() {}
|
|
|
|
//! Prepare this virtual device for destruction.
|
|
virtual bool terminate() = 0;
|
|
|
|
//! Return the physical device for this virtual device.
|
|
const amd::Device& device() const { return device_(); }
|
|
|
|
virtual void submitReadMemory(amd::ReadMemoryCommand& cmd) = 0;
|
|
virtual void submitWriteMemory(amd::WriteMemoryCommand& cmd) = 0;
|
|
virtual void submitCopyMemory(amd::CopyMemoryCommand& cmd) = 0;
|
|
virtual void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) = 0;
|
|
virtual void submitMapMemory(amd::MapMemoryCommand& cmd) = 0;
|
|
virtual void submitUnmapMemory(amd::UnmapMemoryCommand& cmd) = 0;
|
|
virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
|
|
virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
|
|
virtual void submitMarker(amd::Marker& cmd) = 0;
|
|
virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
|
|
virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
|
|
virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) = 0;
|
|
virtual void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) = 0;
|
|
virtual void submitPerfCounter(amd::PerfCounterCommand& cmd) = 0;
|
|
virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) = 0;
|
|
virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd) = 0;
|
|
virtual void flush(amd::Command* list = NULL, bool wait = false) = 0;
|
|
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
|
|
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
|
|
/// Optional extensions
|
|
virtual void submitSignal(amd::SignalCommand& cmd) = 0;
|
|
virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0;
|
|
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
//! Get the blit manager object
|
|
device::BlitManager& blitMgr() const { return *blitMgr_; }
|
|
|
|
private:
|
|
//! Disable default copy constructor
|
|
VirtualDevice& operator=(const VirtualDevice&);
|
|
|
|
//! Disable operator=
|
|
VirtualDevice(const VirtualDevice&);
|
|
|
|
//! The physical device that this virtual device utilizes
|
|
amd::SharedReference<amd::Device> device_;
|
|
|
|
protected:
|
|
device::BlitManager* blitMgr_; //!< Blit manager
|
|
};
|
|
|
|
} // namespace device
|
|
|
|
namespace amd {
|
|
|
|
//! MemoryObject map lookup class
|
|
class MemObjMap : public AllStatic {
|
|
public:
|
|
static size_t size(); //!< obtain the size of the container
|
|
static void AddMemObj(const void* k,
|
|
amd::Memory* v); //!< add the host mem pointer and buffer in the container
|
|
static void RemoveMemObj(const void* k); //!< Remove an entry of mem object from the container
|
|
static amd::Memory* FindMemObj(
|
|
const void* k); //!< find the mem object based on the input pointer
|
|
private:
|
|
static std::map<uintptr_t, amd::Memory*> MemObjMap_; //!< the mem object<->hostptr information container
|
|
static amd::Monitor AllocatedLock_; //!< amd monitor locker
|
|
};
|
|
|
|
/*! \addtogroup Runtime
|
|
* @{
|
|
*
|
|
* \addtogroup Device Device Abstraction
|
|
* @{
|
|
*/
|
|
class Device : public RuntimeObject {
|
|
protected:
|
|
typedef aclCompiler Compiler;
|
|
|
|
public:
|
|
typedef std::list<CommandQueue*> CommandQueues;
|
|
|
|
struct BlitProgram : public amd::HeapObject {
|
|
Program* program_; //!< GPU program obejct
|
|
Context* context_; //!< A dummy context
|
|
|
|
BlitProgram(Context* context) : program_(NULL), context_(context) {}
|
|
~BlitProgram();
|
|
|
|
//! Creates blit program for this device
|
|
bool create(Device* device, //!< Device object
|
|
const char* extraKernel = NULL, //!< Extra kernels from the device layer
|
|
const char* extraOptions = NULL //!< Extra compilation options
|
|
);
|
|
};
|
|
|
|
virtual Compiler* compiler() const = 0;
|
|
virtual Compiler* binCompiler() const { return compiler(); }
|
|
|
|
Device();
|
|
virtual ~Device();
|
|
|
|
//! Initializes abstraction layer device object
|
|
bool create();
|
|
|
|
uint retain() {
|
|
// Overwrite the RuntimeObject::retain().
|
|
// There is an issue in the old SHOC11_DeviceMemory test on TC
|
|
return 0u;
|
|
}
|
|
|
|
uint release() {
|
|
// Overwrite the RuntimeObject::release().
|
|
// There is an issue in the old SHOC11_DeviceMemory test on TC
|
|
return 0u;
|
|
}
|
|
|
|
//! Register a device as available
|
|
void registerDevice();
|
|
|
|
//! Initialize the device layer (enumerate known devices)
|
|
static bool init();
|
|
|
|
//! Shutdown the device layer
|
|
static void tearDown();
|
|
|
|
static std::vector<Device*> getDevices(cl_device_type type, //!< Device type
|
|
bool offlineDevices //!< Enable offline devices
|
|
);
|
|
|
|
static size_t numDevices(cl_device_type type, //!< Device type
|
|
bool offlineDevices //!< Enable offline devices
|
|
);
|
|
|
|
static bool getDeviceIDs(cl_device_type deviceType, //!< Device type
|
|
cl_uint numEntries, //!< Number of entries in the array
|
|
cl_device_id* devices, //!< Array of the device ID(s)
|
|
cl_uint* numDevices, //!< Number of available devices
|
|
bool offlineDevices //!< Report offline devices
|
|
);
|
|
|
|
const device::Info& info() const { return info_; }
|
|
|
|
//! Return svm support capability.
|
|
bool svmSupport() const {
|
|
return (info().svmCapabilities_ &
|
|
(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER |
|
|
CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0
|
|
? true
|
|
: false;
|
|
}
|
|
|
|
//! check svm FGS support capability.
|
|
inline bool isFineGrainedSystem(bool FGSOPT = false) const {
|
|
return FGSOPT && (info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) != 0 ? true
|
|
: false;
|
|
}
|
|
|
|
//! Return this device's type.
|
|
cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
|
|
|
|
//! Create a new virtual device environment.
|
|
virtual device::VirtualDevice* createVirtualDevice(CommandQueue* queue = NULL) = 0;
|
|
|
|
//! Create a program for device.
|
|
virtual device::Program* createProgram(option::Options* options = NULL) = 0;
|
|
|
|
//! Allocate a chunk of device memory as a cache for a CL memory object
|
|
virtual device::Memory* createMemory(Memory& owner) const = 0;
|
|
|
|
//! Allocate a device sampler object
|
|
virtual bool createSampler(const Sampler&, device::Sampler**) const = 0;
|
|
|
|
//! Allocates a view object from the device memory
|
|
virtual device::Memory* createView(
|
|
amd::Memory& owner, //!< Owner memory object
|
|
const device::Memory& parent //!< Parent device memory object for the view
|
|
) const = 0;
|
|
|
|
//! Return true if initialized external API interop, otherwise false
|
|
virtual bool bindExternalDevice(
|
|
uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
|
|
void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL
|
|
void* pContext, //!< HGLRC/GLXContext handle
|
|
bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do
|
|
//! not bind.
|
|
) = 0;
|
|
|
|
virtual bool unbindExternalDevice(
|
|
uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
|
|
void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL
|
|
void* pContext, //!< HGLRC/GLXContext handle
|
|
bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do
|
|
//! not bind.
|
|
) = 0;
|
|
|
|
//! resolves GL depth/msaa buffer
|
|
virtual bool resolveGLMemory(device::Memory*) const { return true; }
|
|
|
|
//! Gets free memory on a GPU device
|
|
virtual bool globalFreeMemory(size_t* freeMemory //!< Free memory information on a GPU device
|
|
) const = 0;
|
|
|
|
/**
|
|
* @return True if the device has its own custom host allocator to be used
|
|
* instead of the generic OS allocation routines
|
|
*/
|
|
bool customHostAllocator() const { return settings().customHostAllocator_ == 1; }
|
|
|
|
/**
|
|
* @copydoc amd::Context::hostAlloc
|
|
*/
|
|
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const {
|
|
ShouldNotCallThis();
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* @copydoc amd::Context::hostFree
|
|
*/
|
|
virtual void hostFree(void* ptr, size_t size = 0) const { ShouldNotCallThis(); }
|
|
|
|
/**
|
|
* @copydoc amd::Context::svmAlloc
|
|
*/
|
|
virtual void* svmAlloc(Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
|
|
void* svmPtr) const = 0;
|
|
/**
|
|
* @copydoc amd::Context::svmFree
|
|
*/
|
|
virtual void svmFree(void* ptr) const = 0;
|
|
|
|
//! Validate kernel
|
|
virtual bool validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev) {
|
|
return true;
|
|
};
|
|
|
|
virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
|
|
return true;
|
|
};
|
|
//! Returns TRUE if the device is available for computations
|
|
bool isOnline() const { return online_; }
|
|
|
|
//! Returns device settings
|
|
const device::Settings& settings() const { return *settings_; }
|
|
|
|
//! Returns blit program info structure
|
|
BlitProgram* blitProgram() const { return blitProgram_; }
|
|
|
|
//! RTTI internal implementation
|
|
virtual ObjectType objectType() const { return ObjectTypeDevice; }
|
|
|
|
//! Returns app profile
|
|
static const AppProfile* appProfile() { return &appProfile_; }
|
|
|
|
//! Register a hardware debugger manager
|
|
HwDebugManager* hwDebugMgr() const { return hwDebugMgr_; }
|
|
|
|
//! Initialize the Hardware Debug Manager
|
|
virtual cl_int hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage) {
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
//! Remove the Hardware Debug Manager
|
|
virtual void hwDebugManagerRemove() {}
|
|
|
|
//! Adds GPU memory to the VA cache list
|
|
void addVACache(device::Memory* memory) const;
|
|
|
|
//! Removes GPU memory from the VA cache list
|
|
void removeVACache(const device::Memory* memory) const;
|
|
|
|
//! Finds GPU memory from virtual address
|
|
device::Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
|
|
|
|
static std::vector<Device*>& devices() { return *devices_; }
|
|
|
|
// P2P devices that are accessible from the current device
|
|
std::vector<cl_device_id> p2pDevices_;
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
|
amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); }
|
|
#endif
|
|
|
|
//! Checks if OCL runtime can use code object manager for compilation
|
|
bool ValidateComgr();
|
|
|
|
virtual amd::Memory *IpcAttach(const void* handle, size_t mem_size, unsigned int flags, void** dev_ptr) const {
|
|
ShouldNotReachHere();
|
|
return nullptr;
|
|
}
|
|
|
|
virtual void IpcDetach(amd::Memory& memory) const {
|
|
ShouldNotReachHere();
|
|
}
|
|
|
|
protected:
|
|
//! Enable the specified extension
|
|
char* getExtensionString();
|
|
|
|
device::Info info_; //!< Device info structure
|
|
device::Settings* settings_; //!< Device settings
|
|
bool online_; //!< The device in online
|
|
BlitProgram* blitProgram_; //!< Blit program info
|
|
static AppProfile appProfile_; //!< application profile
|
|
HwDebugManager* hwDebugMgr_; //!< Hardware Debug manager
|
|
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
|
//! Compilation with cache support
|
|
std::unique_ptr<amd::CacheCompilation> cacheCompilation_;
|
|
#endif
|
|
|
|
private:
|
|
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
|
|
|
|
#if defined(WITH_HSA_DEVICE)
|
|
static AppProfile* rocAppProfile_;
|
|
#endif
|
|
|
|
static std::vector<Device*>* devices_; //!< All known devices
|
|
|
|
Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access
|
|
std::map<uintptr_t, device::Memory*>* vaCacheMap_; //!< VA cache map
|
|
};
|
|
|
|
#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
|
|
//! Compilation process with cache support.
|
|
class CacheCompilation : public amd::HeapObject {
|
|
public:
|
|
enum COMPILER_OPERATION { LINK_LLVM_BITCODES = 0, COMPILE_TO_LLVM, COMPILE_AND_LINK_EXEC };
|
|
|
|
//! Constructor
|
|
CacheCompilation(std::string targetStr, std::string postfix, bool enableCache, bool resetCache);
|
|
|
|
//! NB, the cacheOpt argument is used for specifying the operation
|
|
//! condition, normally would be the same as the options argument.
|
|
//! However, the cacheOpt argument should not include any option
|
|
//! that would be modified each time but not affect the operation,
|
|
//! e.g. output file name.
|
|
|
|
//! Link LLVM bitcode
|
|
bool linkLLVMBitcode(amd::opencl_driver::Compiler* C,
|
|
std::vector<amd::opencl_driver::Data*>& inputs,
|
|
amd::opencl_driver::Buffer* output, std::vector<std::string>& options,
|
|
std::string& buildLog);
|
|
|
|
//! Compile to LLVM bitcode
|
|
bool compileToLLVMBitcode(amd::opencl_driver::Compiler* C,
|
|
std::vector<amd::opencl_driver::Data*>& inputs,
|
|
amd::opencl_driver::Buffer* output, std::vector<std::string>& options,
|
|
std::string& buildLog);
|
|
|
|
//! Compile and link executable
|
|
bool compileAndLinkExecutable(amd::opencl_driver::Compiler* C,
|
|
std::vector<amd::opencl_driver::Data*>& inputs,
|
|
amd::opencl_driver::Buffer* output,
|
|
std::vector<std::string>& options, std::string& buildLog);
|
|
|
|
private:
|
|
StringCache codeCache_; //! Cached codes
|
|
const bool isCodeCacheEnabled_; //! Code cache enable
|
|
};
|
|
#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
|
|
|
|
/*! @}
|
|
* @}
|
|
*/
|
|
|
|
} // namespace amd
|
|
|
|
#endif /*DEVICE_HPP_*/
|