파일
rocm-systems/rocclr/runtime/device/device.hpp
T
foreman b72b88b381 P4 to Git Change 1057024 by skudchad@skudchad_opencl_win_2 on 2014/07/18 20:55:43
ECR #304775 - Expose cl_khr_subgroups in mainline.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#227 edit
2014-07-18 21:03:27 -04:00

1720 라인
55 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef DEVICE_HPP_
#define DEVICE_HPP_
#include "top.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "platform/context.hpp"
#include "platform/object.hpp"
#include "platform/memory.hpp"
#include "utils/util.hpp"
#include "amdocl/cl_kernel.h"
#include "elf/elf.hpp"
#include "appprofile.hpp"
#include "acl.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <map>
#include <list>
#include <set>
#include <utility>
namespace amd {
class Command;
class CommandQueue;
class ReadMemoryCommand;
class WriteMemoryCommand;
class FillMemoryCommand;
class CopyMemoryCommand;
class MapMemoryCommand;
class UnmapMemoryCommand;
class MigrateMemObjectsCommand;
class NDRangeKernelCommand;
class NativeFnCommand;
class FlushCommand;
class FinishCommand;
class AcquireExtObjectsCommand;
class ReleaseExtObjectsCommand;
class PerfCounterCommand;
class ReleaseObjectCommand;
class StallQueueCommand;
class Marker;
class KernelSignature;
class ThreadTraceCommand;
class ThreadTraceMemObjectsCommand;
#if cl_amd_open_video
class RunVideoProgramCommand;
class SetVideoSessionCommand;
#endif // cl_amd_open_video
class SignalCommand;
class MakeBuffersResidentCommand;
class SvmFreeMemoryCommand;
class SvmCopyMemoryCommand;
class SvmFillMemoryCommand;
class SvmMapMemoryCommand;
class SvmUnmapMemoryCommand;
class Device;
struct KernelParameterDescriptor;
struct Coord3D;
namespace option {
class Options;
} // option
}
enum OclExtensions {
ClKhrFp64 = 0,
ClAmdFp64,
ClKhrSelectFpRoundingMode,
ClKhrGlobalInt32BaseAtomics,
ClKhrGlobalInt32ExtendedAtomics,
ClKhrLocalInt32BaseAtomics,
ClKhrLocalInt32ExtendedAtomics,
ClKhrInt64BaseAtomics,
ClKhrInt64ExtendedAtomics,
ClKhr3DImageWrites,
ClKhrByteAddressableStore,
ClKhrFp16,
ClKhrGlSharing,
ClKhrGLDepthImages,
ClExtDeviceFission,
ClExtAtomicCounters32,
ClAmdDeviceAttributeQuery,
ClAmdVec3,
ClAmdPrintf,
ClAmdMediaOps,
ClAmdMediaOps2,
ClAmdPopcnt,
#if defined(_WIN32)
ClKhrD3d10Sharing,
ClKhrD3d11Sharing,
ClKhrD3d9Sharing,
#endif
ClKhrImage2dFromBuffer,
ClAmdImage2dFromBufferReadOnly,
ClAmdOpenVideo,
ClAmdSemaphore,
ClAMDBusAddressableMemory,
ClAMDC11Atomics,
ClKhrSpir,
ClKhrSubGroups,
ClKhrGlEvent,
ClExtTotal
};
static const char*
OclExtensionsString[] = {
"cl_khr_fp64 ", "cl_amd_fp64 ",
"cl_khr_select_fprounding_mode ",
"cl_khr_global_int32_base_atomics ",
"cl_khr_global_int32_extended_atomics ",
"cl_khr_local_int32_base_atomics ",
"cl_khr_local_int32_extended_atomics ",
"cl_khr_int64_base_atomics ",
"cl_khr_int64_extended_atomics ",
"cl_khr_3d_image_writes ",
"cl_khr_byte_addressable_store ",
"cl_khr_fp16 ",
"cl_khr_gl_sharing ",
"cl_khr_gl_depth_images ",
"cl_ext_device_fission ",
"cl_ext_atomic_counters_32 ",
"cl_amd_device_attribute_query ",
"cl_amd_vec3 ",
"cl_amd_printf ",
"cl_amd_media_ops ",
"cl_amd_media_ops2 ",
"cl_amd_popcnt ",
#if defined(_WIN32)
"cl_khr_d3d10_sharing ",
"cl_khr_d3d11_sharing ",
"cl_khr_dx9_media_sharing ",
#endif
"cl_khr_image2d_from_buffer ",
"cl_amd_image2d_from_buffer_read_only ",
"cl_amd_open_video ",
"",
"cl_amd_bus_addressable_memory ",
"cl_amd_c11_atomics ",
"cl_khr_spir ",
"cl_khr_subgroups ",
"cl_khr_gl_event ",
NULL
};
namespace device {
class ClBinary;
class BlitManager;
struct PartitionType : public amd::EmbeddedObject
{
enum {
EQUALLY = (1 << 0),
BY_COUNTS = (1 << 1),
BY_AFFINITY_DOMAIN = (1 << 2)
};
union {
struct {
uint equally_ : 1;
uint byCounts_ : 1;
uint byAffinityDomain_ : 1;
};
uint value_;
};
size_t getNumSet() const { return (size_t)amd::countBitsSet(value_); }
cl_device_partition_property toCL() const;
size_t toCL(cl_device_partition_property* types) const;
#ifdef cl_ext_device_fission
cl_device_partition_property_ext toCLExt() const;
size_t toCLExt(cl_device_partition_property_ext* types) const;
#endif
};
struct AffinityDomain : public amd::EmbeddedObject
{
enum {
AFFINITY_DOMAIN_NUMA = (1 << 0),
AFFINITY_DOMAIN_L4_CACHE = (1 << 1),
AFFINITY_DOMAIN_L3_CACHE = (1 << 2),
AFFINITY_DOMAIN_L2_CACHE = (1 << 3),
AFFINITY_DOMAIN_L1_CACHE = (1 << 4),
AFFINITY_DOMAIN_NEXT_PARTITIONABLE = (1 << 5)
};
union {
struct {
uint numa_ : 1;
uint cacheL4_ : 1;
uint cacheL3_ : 1;
uint cacheL2_ : 1;
uint cacheL1_ : 1;
uint next_ : 1;
};
uint value_;
};
size_t getNumSet() const { return (size_t)amd::countBitsSet(value_); }
cl_device_affinity_domain toCL() const;
#ifdef cl_ext_device_fission
cl_device_partition_property_ext toCLExt() const;
size_t toCLExt(cl_device_partition_property_ext* affinities) const;
#endif
};
//! Device partition properties.
struct PartitionInfo : public amd::EmbeddedObject
{
PartitionType type_;
union {
struct {
size_t numComputeUnits_;
} equally_;
AffinityDomain byAffinityDomain_;
struct {
const cl_uint* countsList_;
size_t listSize_;
} byCounts_;
};
};
//! Create Sub-Devices request properties.
struct CreateSubDevicesInfo : public amd::HeapObject
{
PartitionInfo p_;
virtual cl_uint countsListAt(size_t i) const = 0;
virtual ~CreateSubDevicesInfo() {}
};
template <typename PROP_T>
struct CreateSubDevicesInfoT : public CreateSubDevicesInfo
{
virtual cl_uint countsListAt(size_t i) const {
return (cl_uint)reinterpret_cast<const PROP_T*>(p_.byCounts_.countsList_)[i];
}
void initCountsList(const PROP_T* props) {
p_.byCounts_.countsList_ = reinterpret_cast<const cl_uint*>(props);
p_.byCounts_.listSize_ = 0;
for (; *props != ((PROP_T)0); ++props) {
++p_.byCounts_.listSize_;
}
}
};
//! Physical device properties.
struct Info : public amd::EmbeddedObject
{
//! The OpenCL device type.
cl_device_type type_;
//! A unique device vendor identifier.
cl_uint vendorId_;
//! The number of parallel compute cores on the compute device.
cl_uint maxComputeUnits_;
//! Maximum dimensions that specify the global and local work-item IDs
// used by the data-parallel execution model.
cl_uint maxWorkItemDimensions_;
//! Maximum number of work-items that can be specified in each dimension
// to clEnqueueNDRangeKernel.
size_t maxWorkItemSizes_[3];
//! Maximum number of work-items in a work-group executing a kernel
// using the data-parallel execution model.
size_t maxWorkGroupSize_;
//! Number of shader engines in physical GPU
size_t numberOfShaderEngines;
//! cl_uint Preferred native vector width size for built-in scalar types
// that can be put into vectors.
cl_uint preferredVectorWidthChar_;
cl_uint preferredVectorWidthShort_;
cl_uint preferredVectorWidthInt_;
cl_uint preferredVectorWidthLong_;
cl_uint preferredVectorWidthFloat_;
cl_uint preferredVectorWidthDouble_;
cl_uint preferredVectorWidthHalf_;
//! Returns the native ISA vector width. The vector width is defined as the
// number of scalar elements that can be stored in the vector.
cl_uint nativeVectorWidthChar_;
cl_uint nativeVectorWidthShort_;
cl_uint nativeVectorWidthInt_;
cl_uint nativeVectorWidthLong_;
cl_uint nativeVectorWidthFloat_;
cl_uint nativeVectorWidthDouble_;
cl_uint nativeVectorWidthHalf_;
//! Maximum configured clock frequency of the device in MHz.
cl_uint maxClockFrequency_;
//! Describes the address spaces supported by the device.
cl_uint addressBits_;
//! Max number of simultaneous image objects that can be read by a
// kernel.
cl_uint maxReadImageArgs_;
//! Max number of simultaneous image objects that can be written to
// by a kernel.
cl_uint maxWriteImageArgs_;
//! Max number of simultaneous image objects that can be read/written to
// by a kernel.
cl_uint maxReadWriteImageArgs_;
//! Max size of memory object allocation in bytes.
cl_ulong maxMemAllocSize_;
//! Max width of 2D image in pixels.
size_t image2DMaxWidth_;
//! Max height of 2D image in pixels.
size_t image2DMaxHeight_;
//! Max width of 3D image in pixels.
size_t image3DMaxWidth_;
//! Max height of 3D image in pixels.
size_t image3DMaxHeight_;
//! Max depth of 3D image in pixels.
size_t image3DMaxDepth_;
//! Describes whether images are supported
cl_bool imageSupport_;
//! Max size in bytes of the arguments that can be passed to a kernel.
size_t maxParameterSize_;
//! Maximum number of samplers that can be used in a kernel.
cl_uint maxSamplers_;
//! Describes the alignment in bits of the base address of any
// allocated memory object.
cl_uint memBaseAddrAlign_;
//! The smallest alignment in bytes which can be used for any data type.
cl_uint minDataTypeAlignSize_;
//! Describes single precision floating point capability of the device.
cl_device_fp_config halfFPConfig_;
cl_device_fp_config singleFPConfig_;
cl_device_fp_config doubleFPConfig_;
//! Type of global memory cache supported.
cl_device_mem_cache_type globalMemCacheType_;
//! Size of global memory cache line in bytes.
cl_uint globalMemCacheLineSize_;
//! Size of global memory cache in bytes.
cl_ulong globalMemCacheSize_;
//! Size of global device memory in bytes.
cl_ulong globalMemSize_;
//! Max size in bytes of a constant buffer allocation.
cl_ulong maxConstantBufferSize_;
//! Max number of arguments declared
cl_uint maxConstantArgs_;
//! This is used to determine the type of local memory that is available
cl_device_local_mem_type localMemType_;
//! Size of local memory arena in bytes.
cl_ulong localMemSize_;
//! If enabled, implies that all the memories, caches, registers etc. in
// the device implement error correction.
cl_bool errorCorrectionSupport_;
//! CL_TRUE if the device and the host have a unified memory subsystem and
// is CL_FALSE otherwise.
cl_bool hostUnifiedMemory_;
//! Describes the resolution of device timer.
size_t profilingTimerResolution_;
//! Timer starting point offset to Epoch.
cl_ulong profilingTimerOffset_;
//! CL_TRUE if device is a little endian device.
cl_bool littleEndian_;
//! If enabled, implies that commands can be submitted to command-queues
// created on this device.
cl_bool available_;
//! if the implementation does not have a compiler available to compile
// the program source.
cl_bool compilerAvailable_;
//! Describes the execution capabilities of the device.
cl_device_exec_capabilities executionCapabilities_;
//! Describes the SVM capabilities of the device.
cl_device_svm_capabilities svmCapabilities_;
//! Preferred alignment for OpenCL fine-grained SVM atomic types.
cl_uint preferredPlatformAtomicAlignment_;
//! Preferred alignment for OpenCL global atomic types.
cl_uint preferredGlobalAtomicAlignment_;
//! Preferred alignment for OpenCL local atomic types.
cl_uint preferredLocalAtomicAlignment_;
//! Describes the command-queue properties supported of the host queue.
cl_command_queue_properties queueProperties_;
//! The platform associated with this device
cl_platform_id platform_;
//! Device name string
char name_[0x40];
//! Vendor name string
char vendor_[0x20];
//! OpenCL software driver version string in the form major.minor
char driverVersion_[0x20];
//! Returns the profile name supported by the device.
const char* profile_;
//! Returns the OpenCL version supported by the device.
const char* version_;
//! The highest OpenCL C version supported by the compiler for this device.
const char* oclcVersion_;
//! Returns a space separated list of extension names.
const char* extensions_;
//! Returns if device linker is available
cl_bool linkerAvailable_;
//! Returns the list of built-in kernels, supported by the device
const char* builtInKernels_;
//! Returns max number of pixels for a 1D image created from a buffer object
size_t imageMaxBufferSize_;
//! Returns max number of images in a 1D or 2D image array
size_t imageMaxArraySize_;
//! Returns the list of partition types supported by device
PartitionType partitionProperties_;
//! Returns the list of supported affinity domains for
//! partitioning the device using CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN
AffinityDomain affinityDomain_;
//! Returns the properties argument specified in clCreateSubDevices
//! if device is a subdevice.
PartitionInfo partitionCreateInfo_;
//! Returns CL_TRUE if the devices preference is for the user to be
//! responsible for synchronization
cl_bool preferredInteropUserSync_;
//! Returns maximum size of the internal buffer that holds the output
//! of printf calls from a kernel
size_t printfBufferSize_;
//! Indicates whether UVD interop is supported
cl_bool openVideo_;
//! Indicates maximum number of supported global atomic counters
cl_uint maxAtomicCounters_;
#if cl_amd_open_video
// Decoder
//! Maximum number of simultaneous video sessions/streams
cl_uint maxVideoSessions_;
//! List of supported video attributes (profile/format pairs)
cl_video_attrib_amd* videoAttribs_;
cl_uint numVideoAttribs_;
//Encoder
cl_video_attrib_encode_amd* videoEncAttribs_;
cl_uint numVideoEncAttribs_;
#endif //cl_amd_open_video
//! Returns the topology for the device
cl_device_topology_amd deviceTopology_;
//! Semaphore information
cl_uint maxSemaphores_;
cl_uint maxSemaphoreSize_;
//! Returns the SKU board name for the device
char boardName_[128];
//! Number of SIMD (Single Instruction Multiple Data) units per compute unit
//! that execute in parallel. All work items from the same work group must be
//! executed by SIMDs in the same compute unit.
cl_uint simdPerCU_;
//! The maximum number of work items from the same work group that can be
//! executed by a SIMD in parallel
cl_uint simdWidth_;
//! The number of instructions that a SIMD can execute in parallel
cl_uint simdInstructionWidth_;
//! The number of workitems per wavefront
cl_uint wavefrontWidth_;
//! Number of global memory channels
cl_uint globalMemChannels_;
//! Number of banks in each global memory channel
cl_uint globalMemChannelBanks_;
//! Width in bytes of each of global memory bank
cl_uint globalMemChannelBankWidth_;
//! Local memory size per CU
cl_uint localMemSizePerCU_;
//! Number of banks of local memory
cl_uint localMemBanks_;
cl_uint gfxipVersion_; //!< The core engine GFXIP version
//! Thread trace enable
cl_bool threadTraceEnable_;
//! Image pitch alignment for image2d_from_buffer
cl_uint imagePitchAlignment_;
//! Image base address alignment for image2d_from_buffer
cl_uint imageBaseAddressAlignment_;
//! Describes whether buffers from images are supported
cl_bool bufferFromImageSupport_;
//! Returns the supported SPIR versions for the device
const char* spirVersions_;
//! OpenCL20 device info fields:
//! The max number of pipe objects that can be passed as arguments to a kernel
cl_uint maxPipeArgs_;
//! The max number of reservations that can be active for a pipe per work-item in a kernel
cl_uint maxPipeActiveReservations_;
//! The max size of pipe packet in bytes
cl_uint maxPipePacketSize_;
//! The command-queue properties supported of the device queue.
cl_command_queue_properties queueOnDeviceProperties_;
//! The preferred size of the device queue in bytes
cl_uint queueOnDevicePreferredSize_;
//! The max size of the device queue in bytes
cl_uint queueOnDeviceMaxSize_;
//! The maximum number of device queues
cl_uint maxOnDeviceQueues_;
//! The maximum number of events in use on a device queue
cl_uint maxOnDeviceEvents_;
//! The maximum size of global scope variables
size_t maxGlobalVariableSize_;
size_t globalVariablePreferredTotalSize_;
};
//! Device settings
class Settings : public amd::HeapObject
{
public:
uint64_t extensions_; //!< Supported OCL extensions
union {
struct {
uint partialDispatch_: 1; //!< Enables partial dispatch
uint supportRA_: 1; //!< Support RA channel order format
uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc)
uint waitCommand_: 1; //!< Enables a wait for every submitted command
uint depthMSAAInterop_: 1; //!< Enable depth-stencil and MSAA interop CL-GL inteorp
uint customHostAllocator_: 1;//!< True if device has custom host allocator
// that replaces generic OS allocation routines
uint customSvmAllocator_: 1; //!< True if device has custom SVM allocator
uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format
uint assumeAliases_: 1; //!< Assume aliases in the compilation process
uint reserved_: 23;
};
uint value_;
};
//! Default constructor
Settings();
//! Check the specified extension
bool checkExtension(uint name) const
{ return (extensions_ & (static_cast<uint64_t>(1) << name)) ? true : false; }
//! Enable the specified extension
void enableExtension(uint name) { extensions_ |= static_cast<uint64_t>(1) << name; }
private:
//! Disable copy constructor
Settings(const Settings&);
//! Disable assignment
Settings& operator=(const Settings&);
};
//! Device-independent cache memory, base class for the device-specific
//! memories. One Memory instance refers to one or more of these.
class Memory : public amd::HeapObject
{
public:
//! Resource map flags
enum CpuMapFlags
{
CpuReadWrite = 0x00000000, //!< Lock for CPU read/Write
CpuReadOnly = 0x00000001, //!< Lock for CPU read only operation
CpuWriteOnly = 0x00000002, //!< Lock for CPU write only operation
};
union SyncFlags {
struct {
uint skipParent_ : 1; //!< Skip parent synchronization
uint skipViews_ : 1; //!< Skip views synchronization
uint skipEntire_ : 1; //!< Skip entire synchronization
};
uint value_;
SyncFlags(): value_(0) {}
};
struct WriteMapInfo: public amd::EmbeddedObject
{
amd::Coord3D origin_; //!< Origin of the map location
amd::Coord3D region_; //!< Mapped region
bool entire_; //!< True if the enitre memory was mapped
WriteMapInfo(): origin_(0, 0, 0), region_(0, 0, 0), entire_(false) {}
};
//! Constructor (from an amd::Memory object).
Memory(amd::Memory& owner)
: flags_(0)
, owner_(&owner)
, version_(0)
, mapMemory_(NULL)
, indirectMapCount_(0)
{
size_ = owner.getSize();
}
//! Constructor (no owner), always eager allocation.
Memory(size_t size)
: flags_(0)
, owner_(NULL)
, version_(0)
, mapMemory_(NULL)
, indirectMapCount_(0)
, size_(size)
{
}
enum GLResourceOP
{
GLDecompressResource = 0, // orders the GL driver to decompress any depth-stencil or MSAA resource to be sampled by a CL kernel.
GLInvalidateFBO // orders the GL driver to invalidate any FBO the resource may be bound to, since the resource internal state changed.
};
//! Default destructor for the device memory object
virtual ~Memory() {};
//! Releases virtual objects associated with this memory
void releaseVirtual();
//! Read the size
size_t size() const {return size_;}
//! Gets the owner Memory instance
amd::Memory* owner() const { return owner_; }
//! Immediate blocking write from device cache to owners's backing store.
//! Marks owner as "current" by resetting the last writer to NULL.
virtual void syncHostFromCache(
SyncFlags syncFlags = SyncFlags()
) {}
virtual bool pinSystemMemory(
void* hostPtr, //!< System memory address
size_t size //!< Size of allocated system memory
) { return true; }
//! Releases indirect map surface
virtual void releaseIndirectMap() {}
//! decompress any MSAA/depth-stencil interop surfaces.
//! notify GL to invalidate any surfaces touched by a CL kernel
virtual bool processGLResource(GLResourceOP operation) { return false;}
//! Map the device memory to CPU visible
virtual void* cpuMap(
VirtualDevice& vDev, //!< Virtual device for map operaiton
uint flags = 0, //!< flags for the map operation
// Optimization for multilayer map/unmap
uint startLayer = 0, //!< Start layer for multilayer map
uint numLayers = 0, //!< End layer for multilayer map
size_t* rowPitch = NULL,//!< Row pitch for the device memory
size_t* slicePitch = NULL //!< Slice pitch for the device memory
)
{
amd::Image* image = owner()->asImage();
if (image != NULL) {
*rowPitch = image->getRowPitch();
*slicePitch = image->getSlicePitch();
}
// Default behavior uses preallocated host mem for CPU
return owner()->getHostMem();
}
//! Unmap the device memory
virtual void cpuUnmap(
VirtualDevice& vDev //!< Virtual device for unmap operaiton
) {}
//! Saves map info for this object
//! @note: It's not a thread safe operation, the app must implement
//! synchronization for the multiple write maps if necessary
void saveWriteMapInfo(
const amd::Coord3D origin, //!< Origin of the map location
const amd::Coord3D region, //!< Mapped region
bool entire //!< True if the enitre memory was mapped
);
const WriteMapInfo* writeMapInfo() const { return &writeMapInfo_; }
//! Clear memory object as mapped read only
void clearUnmapWrite() { flags_ &= ~UnmapWrite; }
//! Returns state of map read only flag
bool isUnmapWrite() const { return (flags_ & UnmapWrite) ? true : false; }
//! Returns state of memory direct access flag
bool isHostMemDirectAccess() const
{ return (flags_ & HostMemoryDirectAccess) ? true : false; }
//! Returns state of host memory registration flag
bool isHostMemoryRegistered() const
{ return (flags_ & HostMemoryRegistered) ? true : false; }
protected:
enum Flags {
HostMemoryDirectAccess = 0x00000001, //!< GPU has direct access to the host memory
MapResourceAlloced = 0x00000002, //!< Map resource was allocated
PinnedMemoryAlloced = 0x00000004, //!< An extra pinned resource was allocated
UnmapWrite = 0x00000008, //!< Memory was mapped read-only
SubMemoryObject = 0x00000010, //!< Memory is sub-memory
HostMemoryRegistered = 0x00000020, //!< Host memory was registered
};
uint flags_; //!< Memory object flags
amd::Memory* owner_; //!< The Memory instance that we cache,
//!< or NULL if we're device-private workspace.
volatile size_t version_; //!< The version we're currently shadowing
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
//! not a physical map. When a memory object does not use USE_HOST_PTR we
//! can use a remote resource and DMA, avoiding the additional CPU memcpy.
amd::Memory* mapMemory_; //!< Memory used as map target buffer
volatile size_t indirectMapCount_; //!< Number of maps
WriteMapInfo writeMapInfo_; //!< Saved write map info for partial unmap
//! Increment map count
void incIndMapCount() { ++indirectMapCount_; }
//! Decrement map count
virtual void decIndMapCount() {}
private:
//! Disable default copy constructor
Memory& operator=(const Memory&);
//! Disable operator=
Memory(const Memory&);
//! Our size
size_t size_;
};
class Sampler : public amd::HeapObject
{
public:
//! Constructor
Sampler() {}
//! Default destructor for the device memory object
virtual ~Sampler() {};
//! Returns device specific HW state for the sampler
uint64_t hwSrd() const { return hwSrd_; }
protected:
uint64_t hwSrd_; //!< Device specific HW state for the sampler
private:
//! Disable default copy constructor
Sampler& operator=(const Sampler&);
//! Disable operator=
Sampler(const Sampler&);
};
//! \class DeviceKernel, which will contain the common fields for any device
class Kernel : public amd::HeapObject
{
public:
typedef std::vector<amd::KernelParameterDescriptor> parameters_t;
//! \struct The device kernel workgroup info structure
struct WorkGroupInfo : public amd::EmbeddedObject
{
size_t size_; //!< kernel workgroup size
size_t compileSize_[3]; //!< kernel compiled workgroup size
cl_ulong localMemSize_; //!< amount of used local memory
size_t preferredSizeMultiple_; //!< preferred multiple for launch
cl_ulong privateMemSize_; //!< amount of used private memory
size_t scratchRegs_; //!< amount of used scratch registers
size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD
size_t wavefrontSize_; //!< number of threads per wavefront
size_t availableGPRs_; //!< GPRs available to the program
size_t usedGPRs_; //!< GPRs used by the program
size_t availableSGPRs_; //!< SGPRs available to the program
size_t usedSGPRs_; //!< SGPRs used by the program
size_t availableVGPRs_; //!< VGPRs available to the program
size_t usedVGPRs_; //!< VGPRs used by the program
size_t availableLDSSize_; //!< available LDS size
size_t usedLDSSize_; //!< used LDS size
size_t availableStackSize_; //!< available stack size
size_t usedStackSize_; //!< used stack size
size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint
std::string compileVecTypeHint_; //!< kernel compiled vector type hint
bool uniformWorkGroupSize_; //!< uniform work group size option
};
//! Default constructor
Kernel(const std::string& name): name_(name), signature_(NULL), hsa_(false)
{ memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); }
//! Default destructor
virtual ~Kernel();
//! Validates memory argument
virtual bool validateMemory(
uint idx, //!< Argument's index
amd::Memory* amdMem //!< memory object for validation
) const { return true; }
//! Returns the kernel info structure
const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; }
//! Returns the kernel signature
const amd::KernelSignature& signature() const { return *signature_; }
//! Returns the kernel name
const std::string& name() const { return name_; }
//! Initializes the kernel parameters for the abstraction layer
bool createSignature(const parameters_t& params);
//! Returns TRUE if it's a HSA kernel
bool hsa() const { return hsa_; }
void setUniformWorkGroupSize(bool u)
{
workGroupInfo_.uniformWorkGroupSize_ = u;
}
bool getUniformWorkGroupSize() const {
return workGroupInfo_.uniformWorkGroupSize_;
}
void setReqdWorkGroupSize(size_t x, size_t y, size_t z)
{
workGroupInfo_.compileSize_[0] = x;
workGroupInfo_.compileSize_[1] = y;
workGroupInfo_.compileSize_[2] = z;
}
size_t getReqdWorkGroupSize(int dim) {
return workGroupInfo_.compileSize_[dim];
}
void setWorkGroupSizeHint(size_t x, size_t y, size_t z)
{
workGroupInfo_.compileSizeHint_[0] = x;
workGroupInfo_.compileSizeHint_[1] = y;
workGroupInfo_.compileSizeHint_[2] = z;
}
size_t getWorkGroupSizeHint(int dim) const {
return workGroupInfo_.compileSizeHint_[dim];
}
void setVecTypeHint(const std::string& hint)
{
workGroupInfo_.compileVecTypeHint_ = hint;
}
void setLocalMemSize(size_t size)
{
workGroupInfo_.localMemSize_ = size;
}
void setPreferredSizeMultiple(size_t size)
{
workGroupInfo_.preferredSizeMultiple_ = size;
}
protected:
std::string name_; //!< kernel name
WorkGroupInfo workGroupInfo_; //!< device kernel info structure
amd::KernelSignature* signature_; //!< kernel signature
bool hsa_; //!< True if HSA kernel on GPU
private:
//! Disable default copy constructor
Kernel(const Kernel&);
//! Disable operator=
Kernel& operator=(const Kernel&);
};
//! A program object for a specific device.
class Program : public amd::HeapObject
{
public:
typedef std::pair<const void*, size_t> binary_t;
typedef std::map<std::string, Kernel*> kernels_t;
// type of the program
typedef enum {
TYPE_NONE = 0, // uncompiled
TYPE_COMPILED, // compiled
TYPE_LIBRARY, // linked library
TYPE_EXECUTABLE, // linked executable
TYPE_INTERMEDIATE // intermediate
} type_t;
private:
//! The device target for this binary.
amd::SharedReference<amd::Device> device_;
kernels_t kernels_; //!< The kernel entry points this binary.
type_t type_; //!< type of this program
protected:
ClBinary* clBinary_; //!< The CL program binary file
std::string llvmBinary_; //!< LLVM IR binary code
bool llvmBinaryIsSpir_; //!< LLVM IR binary code is in SPIR format
std::string compileOptions_;//!< compile/build options.
std::string linkOptions_; //!< link options.
//!< the option arg passed in to clCompileProgram(), clLinkProgram(),
//! or clBuildProgram(), whichever is called last
std::string lastBuildOptionsArg_;
std::string buildLog_; //!< build log.
cl_int buildStatus_; //!< build status.
cl_int buildError_; //!< build error
//! The info target for this binary.
aclTargetInfo info_;
size_t globalVariableTotalSize_;
public:
//! Construct a section.
Program(amd::Device& device);
//! Destroy this binary image.
virtual ~Program();
//! Destory all the kernels
void clear();
//! Return the compiler options passed to build this program
amd::option::Options* getCompilerOptions() const { return programOptions; }
//! Compile the device program.
cl_int compile(const std::string& sourceCode,
const std::vector<const std::string*>& headers,
const char** headerIncludeNames,
const char* origOptions,
amd::option::Options* options);
//! Builds the device program.
cl_int link(const std::vector<Program*>& inputPrograms,
const char* origOptions,
amd::option::Options* options);
//! Builds the device program.
cl_int build(const std::string& sourceCode,
const char* origOptions,
amd::option::Options* options);
//! Returns the device object, associated with this program.
const amd::Device& device() { return device_(); }
//! Return the compiler options used to build the program.
const std::string& compileOptions() const { return compileOptions_; }
//! Return the option arg passed in to clCompileProgram(), clLinkProgram(),
//! or clBuildProgram(), whichever is called last
const std::string lastBuildOptionsArg() const {
return lastBuildOptionsArg_;
}
//! Return the build log.
const std::string& buildLog() const { return buildLog_; }
//! Return the build status.
cl_build_status buildStatus() const { return buildStatus_; }
//! Return the build error.
cl_int buildError() const { return buildError_; }
//! Return the symbols vector.
const kernels_t& kernels() const { return kernels_; }
kernels_t& kernels() { return kernels_; }
//! Return the binary image.
inline const binary_t binary() const;
inline binary_t binary();
//! Returns the CL program binary file
ClBinary* clBinary() { return clBinary_; }
const ClBinary* clBinary() const { return clBinary_; }
bool setBinary(char* binaryIn, size_t size);
type_t type() const { return type_; }
void setGlobalVariableTotalSize(size_t size) {
globalVariableTotalSize_ = size;
}
size_t globalVariableTotalSize() const {
return globalVariableTotalSize_;
}
protected:
//! pre-compile setup
virtual bool initBuild(amd::option::Options* options);
//! post-compile cleanup
virtual bool finiBuild(bool isBuildGood);
//! Compile the device program.
virtual bool compileImpl(const std::string& sourceCode,
const std::vector<const std::string*>& headers,
const char** headerIncludeNames,
amd::option::Options* options) = 0;
//! Link the device program.
virtual bool linkImpl(amd::option::Options* options) = 0;
//! Link the device programs.
virtual bool linkImpl(const std::vector<Program*>& inputPrograms,
amd::option::Options* options,
bool createLibrary) = 0;
virtual bool createBinary(amd::option::Options* options) = 0;
virtual bool createBIFBinary(aclBinary* bin);
//! Initialize Binary (used only for clCreateProgramWithBinary()).
bool initClBinary(char* binaryIn, size_t size);
//! Initialize Binary
virtual bool initClBinary() = 0;
//! Release the Binary
virtual void releaseClBinary() = 0;
//! return target info
virtual const aclTargetInfo & info(const char * str = "") = 0;
virtual bool isElf(const char* bin) const = 0;
//! At linking time, get the set of compile options to be used from
//! the set of input program, warn if they have inconsisten compile
//! options.
bool getCompileOptionsAtLinking(const std::vector<Program*>& inputPrograms,
const amd::option::Options* linkOptions);
void setType(type_t newType) { type_ = newType; }
private:
//! Disable default copy constructor
Program(const Program&);
//! Disable operator=
Program& operator=(const Program&);
public:
amd::option::Options* programOptions;
};
class ClBinary : public amd::HeapObject
{
public:
enum BinaryImageFormat {
BIF_VERSION2 = 0, //!< Binary Image Format version 2.0 (ELF)
BIF_VERSION3 //!< Binary Image Format version 3.0 (ELF)
};
//! Constructor
ClBinary(const amd::Device& dev, BinaryImageFormat bifVer = BIF_VERSION2);
//! Destructor
virtual ~ClBinary();
void init(amd::option::Options* optionsObj, bool amdilRequired = false);
/** called only in loading image routines,
never called in storing routines */
bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false);
//! setin elfIn_
bool setElfIn(unsigned char eclass);
void resetElfIn();
//! set out elf
bool setElfOut(unsigned char eclass, const char* outFile);
void resetElfOut();
//! Set elf header information
virtual bool setElfTarget() = 0;
// class used in for loading images in new format
amd::OclElf* elfIn() { return elfIn_; }
// classes used storing and loading images in new format
amd::OclElf* elfOut() { return elfOut_; }
void elfOut(amd::OclElf* v) { elfOut_ = v; }
//! Create and save ELF binary image
bool createElfBinary(bool doencrypt, Program::type_t type);
//save BIF binary image
void saveBIFBinary(char* binaryIn, size_t size);
bool decryptElf(char* binaryIn, size_t size,
char** decryptBin, size_t* decryptSize, int* encryptCode);
//! Returns the binary pair for the abstraction layer
Program::binary_t data() const;
//! Loads llvmir binary from OCL binary file
bool loadLlvmBinary(
std::string& llvmBinary, //!< LLVMIR binary code
bool& llvmBinaryIsSpir //!< LLVMIR binary is in SPIR format
) const;
//! Loads compile options from OCL binary file
bool loadCompileOptions(
std::string& compileOptions //!< return the compile options loaded
) const;
//! Loads link options from OCL binary file
bool loadLinkOptions(
std::string& linkOptions //!< return the link options loaded
) const;
//! Store compile options into OCL binary file
void storeCompileOptions(
const std::string& compileOptions //!< the compile options to be stored
);
//! Store link options into OCL binary file
void storeLinkOptions(
const std::string& linkOptions //!< the link options to be stored
);
//! Check if the binary is recompilable
bool isRecompilable(std::string& llvmBinary,
amd::OclElf::oclElfPlatform thePlatform);
void saveOrigBinary(char* origBinary, size_t origSize) {
origBinary_ = origBinary;
origSize_ = origSize;
}
void restoreOrigBinary() {
if (origBinary_ != NULL) {
(void)setBinary(origBinary_, origSize_, false);
}
}
//! Set Binary flags
void setFlags(int encryptCode);
bool saveSOURCE () {
return ((flags_ & BinarySourceMask) == BinarySaveSource);
}
bool saveLLVMIR () {
return ((flags_ & BinaryLlvmirMask) == BinarySaveLlvmir);
}
bool saveAMDIL () {
return ((flags_ & BinaryAmdilMask) == BinarySaveAmdil);
}
bool saveISA () {
return ((flags_ & BinaryIsaMask) == BinarySaveIsa);
}
bool saveAS () {
return ((flags_ & BinaryASMask) == BinarySaveAS);
}
// Return the encrypt code for this input binary ( "> 0" means encrypted)
int getEncryptCode() { return encryptCode_; }
// Returns TRUE of binary file is SPIR
bool isSPIR() const;
protected:
enum Flags {
BinaryAllocated = 0x1, //!< Binary was created
// Source control
BinaryNoSaveSource = 0x0, // 0: default
BinaryRemoveSource = 0x2, // for encrypted binary
BinarySaveSource = 0x4,
BinarySourceMask = 0x6,
// LLVMIR control
BinarySaveLlvmir = 0x0, // 0: default
BinaryRemoveLlvmir = 0x8, // for encrypted binary
BinaryNoSaveLlvmir = 0x10,
BinaryLlvmirMask = 0x18,
// AMDIL control
BinarySaveAmdil = 0x0, // 0: default
BinaryRemoveAmdil = 0x20, // for encrypted binary
BinaryNoSaveAmdil = 0x40,
BinaryAmdilMask = 0x60,
// ISA control
BinarySaveIsa = 0x0, // 0: default
BinaryRemoveIsa = 0x80, // for encrypted binary
BinaryNoSaveIsa = 0x100,
BinaryIsaMask = 0x180,
// AS control
BinaryNoSaveAS = 0x0, // 0: default
BinaryRemoveAS = 0x200, // for encrypted binary
BinarySaveAS = 0x400,
BinaryASMask = 0x600
};
//! Returns TRUE if binary file was allocated
bool isBinaryAllocated() const
{ return (flags_ & BinaryAllocated) ? true : false; }
//! Returns BIF symbol name by symbolID,
//! returns empty string if not found or if BIF version is unsupported
std::string getBIFSymbol(unsigned int symbolID) const;
protected:
const amd::Device& dev_; //!< Device object
private:
//! Disable default copy constructor
ClBinary(const ClBinary&);
//! Disable default operator=
ClBinary& operator=(const ClBinary&);
//! Releases the binary data store
void release();
char* binary_; //!< binary data
size_t size_; //!< binary size
uint flags_; //!< CL binary object flags
char* origBinary_; //!< original binary data
size_t origSize_; //!< original binary size
int encryptCode_; //!< Encryption Code for input binary (0 for not encrypted)
protected:
amd::OclElf *elfIn_; //!< ELF object for input ELF binary
amd::OclElf *elfOut_; //!< ELF object for output ELF binary
BinaryImageFormat format_; //!< which binary image format to use
};
inline const Program::binary_t Program::binary() const {
if (clBinary() == NULL) {
return std::make_pair((const void*)0, 0);
}
return clBinary()->data();
}
inline Program::binary_t Program::binary() {
if (clBinary() == NULL) {
return std::make_pair((const void*)0, 0);
}
return clBinary()->data();
}
/*! \class PerfCounter
*
* \brief The device interface class for the performance counters
*/
class PerfCounter : public amd::HeapObject
{
public:
//! Constructor for the device performance
PerfCounter() {}
//! Get the performance counter info
virtual uint64_t getInfo(uint64_t infoType) const = 0;
//! Destructor for PerfCounter class
virtual ~PerfCounter() {}
private:
//! Disable default copy constructor
PerfCounter(const PerfCounter&);
//! Disable default operator=
PerfCounter& operator=(const PerfCounter&);
};
/*! \class ThreadTrace
*
* \brief The device interface class for the performance counters
*/
class ThreadTrace : public amd::HeapObject
{
public:
//! Constructor for the device performance
ThreadTrace() {}
//! Update ThreadTrace status to true/false if new buffer was binded/unbinded respectively
virtual void setNewBufferBinded(bool) = 0;
//! Get the performance counter info
virtual bool info(uint infoType, uint* info,uint infoSize) const = 0;
//! Destructor for PerfCounter class
virtual ~ThreadTrace() {}
private:
//! Disable default copy constructor
ThreadTrace(const ThreadTrace&);
//! Disable default operator=
ThreadTrace& operator=(const ThreadTrace&);
};
//! A device execution environment.
class VirtualDevice : public amd::HeapObject
{
public:
//! Construct a new virtual device for the given physical device.
VirtualDevice(amd::Device& device)
: device_(device), blitMgr_(NULL)
{ }
//! Destroy this virtual device.
virtual ~VirtualDevice()
{ }
//! Prepare this virtual device for destruction.
virtual bool terminate() = 0;
//! Return the physical device for this virtual device.
const amd::Device& device() const { return device_(); }
virtual void submitReadMemory(amd::ReadMemoryCommand& cmd) = 0;
virtual void submitWriteMemory(amd::WriteMemoryCommand& cmd) = 0;
virtual void submitCopyMemory(amd::CopyMemoryCommand& cmd) = 0;
virtual void submitMapMemory(amd::MapMemoryCommand& cmd) = 0;
virtual void submitUnmapMemory(amd::UnmapMemoryCommand& cmd) = 0;
virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0;
virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0;
virtual void submitMarker(amd::Marker& cmd) = 0;
virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0;
virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0;
virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) = 0;
virtual void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) = 0;
virtual void submitPerfCounter(amd::PerfCounterCommand& cmd) = 0;
virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) = 0;
virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd) = 0;
virtual void flush(amd::Command* list = NULL, bool wait = false) = 0;
#if cl_amd_open_video
virtual void submitRunVideoProgram(amd::RunVideoProgramCommand& cmd) = 0;
virtual void submitSetVideoSession(amd::SetVideoSessionCommand& cmd) = 0;
#endif // cl_amd_open_video
virtual void submitSignal(amd::SignalCommand & cmd) = 0;
virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & cmd) = 0;
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) = 0;
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) = 0;
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
//! Get the blit manager object
device::BlitManager& blitMgr() const { return *blitMgr_; }
private:
//! Disable default copy constructor
VirtualDevice& operator=(const VirtualDevice&);
//! Disable operator=
VirtualDevice(const VirtualDevice&);
//! The physical device that this virtual device utilizes
amd::SharedReference<amd::Device> device_;
protected:
device::BlitManager* blitMgr_; //!< Blit manager
};
} // namespace device
namespace amd {
//! SvmManager class
class SvmManager : public AllStatic
{
public:
static size_t size(); //!< obtain the size of the container
static void AddSvmBuffer(const void* k, amd::Memory* v); //!< add the svm pointer and buffer in the container
static void RemoveSvmBuffer(const void* k); //!< Remove an entry of svm info from the container
static amd::Memory* FindSvmBuffer(const void* k); //!< find the svm buffer based on the input pointer
private:
static std::map<uintptr_t, amd::Memory*> svmBufferMap_; //!< the svm space information container
static amd::Monitor AllocatedLock_; //!< amd monitor locker
};
/*! \addtogroup Runtime
* @{
*
* \addtogroup Device Device Abstraction
* @{
*/
class Device : public RuntimeObject
{
public:
typedef std::list<CommandQueue*> CommandQueues;
struct BlitProgram : public amd::HeapObject
{
Program* program_; //!< GPU program obejct
Context* context_; //!< A dummy context
BlitProgram(Context* context): program_(NULL), context_(context) {}
~BlitProgram();
//! Creates blit program for this device
bool create(Device* device, //!< Device object
const char* extraKernel = NULL, //!< Extra kernels from the device layer
const char* extraOptions = NULL //!< Extra compilation options
);
};
virtual aclCompiler* compiler() const = 0;
Device(Device* parent = NULL);
virtual ~Device();
//! Increment the reference count
uint retain() {
// Only increment the reference count of sub-devices
return !isRootDevice() ? RuntimeObject::retain() : 0u;
}
//! Decrement the reference count
uint release() {
// Only decrement the reference count of sub-devices
return !isRootDevice() ? RuntimeObject::release() : 0u;
}
//! Register a device as available
void registerDevice();
//! Initialize the device layer (enumerate known devices)
static bool init();
//! Shutdown the device layer
static void tearDown();
static std::vector<Device*>
getDevices(
cl_device_type type, //!< Device type
bool offlineDevices //!< Enable offline devices
);
static size_t
numDevices(
cl_device_type type, //!< Device type
bool offlineDevices //!< Enable offline devices
);
static bool
getDeviceIDs(
cl_device_type deviceType, //!< Device type
cl_uint numEntries, //!< Number of entries in the array
cl_device_id* devices, //!< Array of the device ID(s)
cl_uint* numDevices, //!< Number of available devices
bool offlineDevices //!< Report offline devices
);
const device::Info& info() const { return info_; }
//! Return this device's type.
cl_device_type type() const {
return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT | CL_HSA_ENABLED_AMD
| CL_HSA_DISABLED_AMD);
}
//! Create sub-devices according to the given partition scheme.
virtual cl_int createSubDevices(
device::CreateSubDevicesInfo& create_info,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) = 0;
//! Create a new virtual device environment.
virtual device::VirtualDevice* createVirtualDevice(
bool profiling,
bool interopQueue
#if cl_amd_open_video
, void* calVideoProperties = NULL
#endif // cl_amd_open_video
, uint deviceQueueSize = 0
) = 0;
//! Compile the given source code.
virtual device::Program* createProgram(int oclVer = 120) = 0;
//! Allocate a chunk of device memory as a cache for a CL memory object
virtual device::Memory* createMemory(Memory& owner) const = 0;
//! Allocate a device sampler object
virtual bool createSampler(const Sampler&, device::Sampler**) const = 0;
//! Allocates a view object from the device memory
virtual device::Memory* createView(
amd::Memory& owner, //!< Owner memory object
const device::Memory& parent //!< Parent device memory object for the view
) const = 0;
//! Reallocates device memory object
virtual bool reallocMemory(Memory& owner) const = 0;
//! Return true if initialized external API interop, otherwise false
virtual bool bindExternalDevice(
intptr_t type, //!< Enum val. for ext.API type: GL, D3D10, etc.
void* pDevice, //!< D3D device do D3D, HDC/Display handle of X Window for GL
void* pContext, //!< HGLRC/GLXContext handle
bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do not bind.
) = 0;
virtual bool unbindExternalDevice(
intptr_t type, //!< Enum val. for ext.API type: GL, D3D10, etc.
void* pDevice, //!< D3D device do D3D, HDC/Display handle of X Window for GL
void* pContext, //!< HGLRC/GLXContext handle
bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do not bind.
) = 0;
//! resolves GL depth/msaa buffer
virtual bool resolveGLMemory(device::Memory*) const { return true; }
//! Gets a pointer to a region of host-visible memory for use as the target
//! of an indirect map for a given memory object
virtual void* allocMapTarget(
amd::Memory& mem, //!< Abstraction layer memory object
const amd::Coord3D& origin, //!< The map location in memory
const amd::Coord3D& region, //!< The map region in memory
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
) = 0;
//! Gets free memory on a GPU device
virtual bool globalFreeMemory(
size_t* freeMemory //!< Free memory information on a GPU device
) const = 0;
/**
* @return True if the device has its own custom host allocator to be used
* instead of the generic OS allocation routines
*/
bool customHostAllocator() const {
return settings().customHostAllocator_ == 1;
}
/**
* @copydoc amd::Context::hostAlloc
*/
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const
{
ShouldNotCallThis();
return NULL;
}
/**
* @copydoc amd::Context::hostFree
*/
virtual void hostFree(void* ptr, size_t size = 0) const
{
ShouldNotCallThis();
}
/**
* @return True if the device has its own custom SVM allocator
*/
bool customSvmAllocator() const {
return settings().customSvmAllocator_ == 1;
}
/**
* @copydoc amd::Context::svmAlloc
*/
virtual void* svmAlloc(Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags) const = 0;
/**
* @copydoc amd::Context::svmFree
*/
virtual void svmFree(void* ptr) const = 0;
//! Validate kernel
virtual bool validateKernel(
const amd::Kernel& kernel,
const device::VirtualDevice* vdev) { return true; };
//! Returns true if the given binary image is valid for this device.
bool verifyBinaryImage( const void* image, size_t size) const;
//! Returns TRUE if the device is available for computations
bool isOnline() const { return online_; }
//! Returns TRUE if the device is a root device (as opposed to sub-device)
bool isRootDevice() const { return parent_ == NULL; }
//! Returns TRUE if 'this' is an ancestor of the given sub-device.
bool isAncestor(const Device* sub) const;
//! Return the parent device.
Device* parent() const { return parent_; }
//! Return the root device for this instance;
Device& rootDevice() {
Device* root = this;
while (!root->isRootDevice()) {
root = root->parent_;
}
return *root;
}
const Device& rootDevice() const {
const Device* root = this;
while (!root->isRootDevice()) {
root = root->parent_;
}
return *root;
}
//! Returns device settings
const device::Settings& settings() const { return *settings_; }
//! Returns blit program info structure
BlitProgram* blitProgram() const { return blitProgram_; }
//! RTTI internal implementation
virtual ObjectType objectType() const {return ObjectTypeDevice;}
protected:
//! Enable the specified extension
char* getExtensionString();
device::Info info_; //!< Device info structure
device::Settings* settings_; //!< Device settings
bool online_; //!< The device in online
BlitProgram* blitProgram_; //!< Blit program info
private:
bool IsHsaCapableDevice();
bool IsTypeMatching(cl_device_type type, bool offlineDevices);
#if defined(WITH_HSA_DEVICE)
static AppProfile* oclhsaAppProfile_;
#endif
static bool isHsaDeviceAvailable_;
static bool isGpuDeviceAvailable_;
typedef std::vector<Device*>::iterator device_iterator;
static std::vector<Device*>* devices_; //!< All known devices
Device* parent_; //!< This device's parent
};
struct KernelParameterDescriptor
{
const char* name_; //!< The parameter's name in the source
clk_value_type_t type_; //!< The parameter's type
size_t offset_; //!< Its offset in the parameter's stack
size_t size_; //!< Its size in bytes
//! Argument's address qualifier
cl_kernel_arg_address_qualifier addressQualifier_;
//! Argument's access qualifier
cl_kernel_arg_access_qualifier accessQualifier_;
//! Argument's type qualifier
cl_kernel_arg_type_qualifier typeQualifier_;
const char* typeName_; //!< Argument's type name
};
/*! @}
* @}
*/
} // namespace amd
#endif /*DEVICE_HPP_*/