2
0

P4 to Git Change 1144455 by gandryey@gera-dev-w7 on 2015/04/24 19:06:19

ECR #304775 - Fix offline compilation for Hawaii with -cl-fp32-correctly-rounded-divide-sqrt flag
	- The check for -cl-fp32-correctly-rounded-divide-sqrt support reads the device info, but the device info was never fully populated for offline devices. This change fills in the device info structure for offline devices as well.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#508 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#143 edit


[ROCm/clr commit: 8c00d76f96]
Este cometimento está contido em:
foreman
2015-04-24 19:18:07 -04:00
ascendente c15b0c27e1
cometimento 2359fa9fcd
2 ficheiros modificados com 345 adições e 347 eliminações
+332 -337
Ver ficheiro
@@ -120,19 +120,13 @@ NullDevice::create(CALtarget target)
{
CALdeviceattribs calAttr = {0};
CALdeviceVideoAttribs calVideoAttr = {0};
CALdevicestatus calDevStatus = {0};
online_ = false;
// Mark the device as GPU type
info_.type_ = CL_DEVICE_TYPE_GPU;
info_.vendorId_ = 0x1002;
calTarget_ = calAttr.target = target;
hwInfo_ = &DeviceInfo[calTarget_];
// Report the device name
::strcpy(info_.name_, hwInfo()->targetName_);
// Force double if it could be supported
switch (target) {
case CAL_TARGET_CAYMAN:
@@ -177,19 +171,19 @@ NullDevice::create(CALtarget target)
return false;
}
info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_;
// Report 512MB for all offline devices
calDevStatus.availVisibleHeap = 512;
calDevStatus.largestBlockVisibleHeap = 512;
calAttr.localRAM = 512;
// Initialize the extension string for offline devices
info_.extensions_ = getExtensionString();
// Fill the device info structure
fillDeviceInfo(calAttr, calDevStatus, 4096, 1, true
#if cl_amd_open_video
, getVideoAttribs()
#endif //cl_amd_open_video
);
// Fill the version info
::strcpy(info_.name_, hwInfo()->targetName_);
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
AMD_BUILD_STRING);
if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 2.0 ";
// Runtime doesn't know what local size could be on the real board
info_.maxGlobalVariableSize_ = static_cast<size_t>(512 * Mi);
@@ -214,10 +208,6 @@ NullDevice::create(CALtarget target)
}
}
}
else {
info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 1.2 ";
}
return true;
}
@@ -239,6 +229,324 @@ NullDevice::createProgram(int oclVer)
return nullProgram;
}
void NullDevice::fillDeviceInfo(
const CALdeviceattribs& calAttr,
const CALdevicestatus& calStatus,
size_t maxTextureSize,
uint numComputeRings,
bool isVirtualMode
#if cl_amd_open_video
,
const CALdeviceVideoAttribs& calVideoAttr
#endif // cl_amd_open_video
)
{
info_.type_ = CL_DEVICE_TYPE_GPU;
info_.vendorId_ = 0x1002;
info_.maxComputeUnits_ = calAttr.numberOfSIMD;
info_.maxWorkItemDimensions_ = 3;
info_.numberOfShaderEngines = calAttr.numberOfShaderEngines;
if (settings().siPlus_) {
// SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates.
// For example, float4 is not faster than float as long as all threads fetch the same
// amount of data and the reads are coalesced. This is from the H/W team and confirmed
// through experimentation. May also be true on EG/NI, but no point in confusing
// developers now.
info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4;
info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2;
info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1;
info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1;
info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1;
info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
(settings().checkExtension(ClKhrFp64)) ? 1 : 0;
info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support
}
else {
info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 16;
info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 8;
info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 4;
info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 2;
info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 4;
info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
(settings().checkExtension(ClKhrFp64)) ? 2 : 0;
info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support
}
info_.maxClockFrequency_ = (calAttr.engineClock != 0) ? calAttr.engineClock : 555;
info_.maxParameterSize_ = 1024;
info_.minDataTypeAlignSize_ = sizeof(cl_long16);
info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
| CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA;
if (GPU_FORCE_SINGLE_FP_DENORM) {
info_.singleFPConfig_ |= CL_FP_DENORM;
}
if (settings().checkExtension(ClKhrFp64)) {
info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM;
}
if (settings().reportFMA_) {
info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
}
info_.globalMemCacheLineSize_ = settings().cacheLineSize_;
info_.globalMemCacheSize_ = settings().cacheSize_;
if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) {
info_.globalMemCacheType_ = CL_READ_WRITE_CACHE;
}
else {
info_.globalMemCacheType_ = CL_NONE;
}
if (isVirtualMode) {
#if defined(ATI_OS_LINUX)
info_.globalMemSize_ =
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
// globalMemSize is the actual available size for app on Linux
// Because Linux base driver doesn't support paging
static_cast<cl_ulong>(calStatus.availVisibleHeap +
calStatus.availInvisibleHeap) / 100u) * Mi;
#else
info_.globalMemSize_ =
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
#endif
if (settings().apuSystem_) {
info_.globalMemSize_ +=
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi);
}
// We try to calculate the largest available memory size from
// the largest available block in either heap. In theory this
// should be the size we can actually allocate at application
// start. Note that it may not be a guarantee still as the
// application progresses.
info_.maxMemAllocSize_ = std::max(
cl_ulong(calStatus.largestBlockVisibleHeap * Mi),
cl_ulong(calStatus.largestBlockInvisibleHeap * Mi));
#if defined(ATI_OS_WIN)
if (settings().apuSystem_) {
info_.maxMemAllocSize_ = std::max(
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi),
info_.maxMemAllocSize_);
}
#endif
info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
//! \note Force max single allocation size.
//! 4GB limit for the blit kernels and 64 bit optimizations.
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
static_cast<cl_ulong>(settings().maxAllocSize_));
}
else {
uint maxHeapSize = flagIsDefault(GPU_MAX_HEAP_SIZE) ? 50 : GPU_MAX_HEAP_SIZE;
info_.globalMemSize_ = (std::min(maxHeapSize, 100u)
* calAttr.localRAM / 100u) * Mi;
uint maxAllocSize = flagIsDefault(GPU_SINGLE_ALLOC_PERCENT) ? 25 : GPU_SINGLE_ALLOC_PERCENT;
info_.maxMemAllocSize_ = cl_ulong(info_.globalMemSize_ *
std::min(maxAllocSize, 100u) / 100u);
}
if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) {
LogError("We are unable to get a heap large enough to support the OpenCL minimum "\
"requirement for FULL_PROFILE");
}
info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_);
// Clamp max single alloc size to the globalMemSize since it's
// reduced by default
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_);
// We need to verify that we are not reporting more global memory
// that 4x single alloc
info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_);
// Use 64 bit pointers
if (settings().use64BitPtr_) {
info_.addressBits_ = 64;
}
else {
info_.addressBits_ = 32;
// Limit total size with 3GB for 32 bit
info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi));
}
// Alignment in BITS of the base address of any allocated memory object
static const size_t MemBaseAlignment = 256;
//! @note Force 256 bytes alignment, since currently
//! calAttr.surface_alignment returns 4KB. For pinned memory runtime
//! should be able to create a view with 256 bytes alignement
info_.memBaseAddrAlign_ = 8 * MemBaseAlignment;
info_.maxConstantBufferSize_ = 64 * Ki;
info_.maxConstantArgs_ = MaxConstArguments;
// Image support fields
if (settings().imageSupport_) {
info_.imageSupport_ = CL_TRUE;
info_.maxSamplers_ = MaxSamplers;
info_.maxReadImageArgs_ = MaxReadImage;
info_.maxWriteImageArgs_ = MaxWriteImage;
info_.image2DMaxWidth_ = maxTextureSize;
info_.image2DMaxHeight_ = maxTextureSize;
info_.image3DMaxWidth_ = std::min(2 * Ki, maxTextureSize);
info_.image3DMaxHeight_ = std::min(2 * Ki, maxTextureSize);
info_.image3DMaxDepth_ = std::min(2 * Ki, maxTextureSize);
info_.imagePitchAlignment_ = 256; // XXX: 256 pixel pitch alignment for now
info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now
info_.bufferFromImageSupport_ = (isVirtualMode) ? CL_TRUE : CL_FALSE;
}
info_.errorCorrectionSupport_ = CL_FALSE;
if (settings().apuSystem_) {
info_.hostUnifiedMemory_ = CL_TRUE;
}
info_.profilingTimerResolution_ = 1;
info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos();
info_.littleEndian_ = CL_TRUE;
info_.available_ = CL_TRUE;
info_.compilerAvailable_ = CL_TRUE;
info_.linkerAvailable_ = CL_TRUE;
info_.executionCapabilities_ = CL_EXEC_KERNEL;
info_.preferredPlatformAtomicAlignment_ = 0;
info_.preferredGlobalAtomicAlignment_ = 0;
info_.preferredLocalAtomicAlignment_ = 0;
info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE;
info_.platform_ = AMD_PLATFORM;
#if cl_amd_open_video
// Open Video support
// Decoder
info_.openVideo_ = settings().openVideo_;
info_.maxVideoSessions_ = calVideoAttr.max_decode_sessions;
info_.numVideoAttribs_ = (calVideoAttr.data_size - 2 * sizeof(CALuint))
/ sizeof(CALvideoAttrib);
info_.videoAttribs_ = const_cast<cl_video_attrib_amd*>(
reinterpret_cast<const cl_video_attrib_amd*>(calVideoAttr.video_attribs));
// Encoder
info_.numVideoEncAttribs_ = (calVideoAttr.data_size - 2 * sizeof(CALuint))
/ sizeof(CALvideoEncAttrib);
info_.videoEncAttribs_ = const_cast<cl_video_attrib_encode_amd*>(
reinterpret_cast<const cl_video_attrib_encode_amd*>(calVideoAttr.video_enc_attribs));
#endif // cl_amd_open_video
::strcpy(info_.name_, hwInfo()->targetName_);
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
AMD_BUILD_STRING "%s", (isVirtualMode) ? " (VM)": "");
info_.profile_ = "FULL_PROFILE";
if (settings().oclVersion_ == OpenCL20) {
info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 2.0 ";
info_.spirVersions_ = "1.2";
}
else if (settings().oclVersion_ == OpenCL12) {
info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 1.2 ";
info_.spirVersions_ = "1.2";
}
else {
info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 1.0 ";
info_.spirVersions_ = "";
LogError("Unknown version for support");
}
// Fill workgroup info size
info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_;
info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_;
info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_;
info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_;
if (settings().hwLDSSize_ != 0) {
info_.localMemType_ = CL_LOCAL;
info_.localMemSize_ = settings().hwLDSSize_;
}
else {
info_.localMemType_ = CL_GLOBAL;
info_.localMemSize_ = 16 * Ki;
}
info_.extensions_ = getExtensionString();
if (settings().checkExtension(ClExtAtomicCounters32)) {
info_.maxAtomicCounters_ = MaxAtomicCounters;
}
info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation&(0xFF<<8))>>8;
info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation&(0x1F<<3))>>3;
info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation&0x07);
::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_));
// OpenCL1.2 device info fields
info_.builtInKernels_ = "";
info_.imageMaxBufferSize_ = MaxImageBufferSize;
info_.imageMaxArraySize_ = MaxImageArraySize;
info_.preferredInteropUserSync_ = true;
info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_;
if (settings().oclVersion_ >= OpenCL20) {
info_.svmCapabilities_ =
(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
if (settings().svmAtomics_) {
info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS;
}
if (settings().svmFineGrainSystem_) {
info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
}
// OpenCL2.0 device info fields
info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility
info_.maxReadWriteImageArgs_ = MaxReadWriteImage;
info_.maxPipePacketSize_ = info_.maxMemAllocSize_;
info_.maxPipeActiveReservations_ = 16;
info_.maxPipeArgs_ = 16;
info_.queueOnDeviceProperties_ =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
info_.queueOnDevicePreferredSize_ = 256 * Ki;
info_.queueOnDeviceMaxSize_ = 512 * Ki;
info_.maxOnDeviceQueues_ = 1;
info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_);
//! \todo Remove % calculation.
//! Use 90% of max single alloc size.
//! Boards with max single alloc size around 4GB will fail allocations
info_.maxGlobalVariableSize_ = static_cast<size_t>(
amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256));
}
if (settings().checkExtension(ClAmdDeviceAttributeQuery)) {
info_.simdPerCU_ = hwInfo()->simdPerCU_;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
info_.wavefrontWidth_ = calAttr.wavefrontSize;
info_.globalMemChannels_ = calAttr.memBusWidth / 32;
info_.globalMemChannelBanks_ = calAttr.numMemBanks;
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_;
info_.localMemBanks_ = hwInfo()->localMemBanks_;
info_.gfxipVersion_ = hwInfo()->gfxipVersion_;
info_.numAsyncQueues_ = numComputeRings;
info_.threadTraceEnable_ = settings().threadTraceEnable_;
}
}
void
Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings)
{
@@ -486,321 +794,6 @@ Device::~Device()
close();
}
void Device::fillDeviceInfo(
const CALdeviceattribs& calAttr,
const CALdevicestatus& calStatus
#if cl_amd_open_video
,
const CALdeviceVideoAttribs& calVideoAttr
#endif // cl_amd_open_video
)
{
info_.type_ = CL_DEVICE_TYPE_GPU;
info_.vendorId_ = 0x1002;
info_.maxComputeUnits_ = calAttr.numberOfSIMD;
info_.maxWorkItemDimensions_ = 3;
info_.numberOfShaderEngines = calAttr.numberOfShaderEngines;
if (settings().siPlus_) {
// SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates.
// For example, float4 is not faster than float as long as all threads fetch the same
// amount of data and the reads are coalesced. This is from the H/W team and confirmed
// through experimentation. May also be true on EG/NI, but no point in confusing
// developers now.
info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4;
info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2;
info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1;
info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1;
info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1;
info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
(settings().checkExtension(ClKhrFp64)) ? 1 : 0;
info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support
}
else {
info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 16;
info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 8;
info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 4;
info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 2;
info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 4;
info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
(settings().checkExtension(ClKhrFp64)) ? 2 : 0;
info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support
}
info_.maxClockFrequency_ = (calAttr.engineClock != 0) ? calAttr.engineClock : 555;
info_.maxParameterSize_ = 1024;
info_.minDataTypeAlignSize_ = sizeof(cl_long16);
info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
| CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA;
if (GPU_FORCE_SINGLE_FP_DENORM) {
info_.singleFPConfig_ |= CL_FP_DENORM;
}
if (settings().checkExtension(ClKhrFp64)) {
info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM;
}
if (settings().reportFMA_) {
info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
}
info_.globalMemCacheLineSize_ = settings().cacheLineSize_;
info_.globalMemCacheSize_ = settings().cacheSize_;
if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) {
info_.globalMemCacheType_ = CL_READ_WRITE_CACHE;
}
else {
info_.globalMemCacheType_ = CL_NONE;
}
if (heap()->isVirtual()) {
#if defined(ATI_OS_LINUX)
info_.globalMemSize_ =
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
// globalMemSize is the actual available size for app on Linux
// Because Linux base driver doesn't support paging
static_cast<cl_ulong>(calStatus.availVisibleHeap +
calStatus.availInvisibleHeap) / 100u) * Mi;
#else
info_.globalMemSize_ =
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
#endif
if (settings().apuSystem_) {
info_.globalMemSize_ +=
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi);
}
// We try to calculate the largest available memory size from
// the largest available block in either heap. In theory this
// should be the size we can actually allocate at application
// start. Note that it may not be a guarantee still as the
// application progresses.
info_.maxMemAllocSize_ = std::max(
cl_ulong(calStatus.largestBlockVisibleHeap * Mi),
cl_ulong(calStatus.largestBlockInvisibleHeap * Mi));
#if defined(ATI_OS_WIN)
if (settings().apuSystem_) {
info_.maxMemAllocSize_ = std::max(
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi),
info_.maxMemAllocSize_);
}
#endif
info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
//! \note Force max single allocation size.
//! 4GB limit for the blit kernels and 64 bit optimizations.
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
static_cast<cl_ulong>(settings().maxAllocSize_));
}
else {
uint maxHeapSize = flagIsDefault(GPU_MAX_HEAP_SIZE) ? 50 : GPU_MAX_HEAP_SIZE;
info_.globalMemSize_ = (std::min(maxHeapSize, 100u)
* calAttr.localRAM / 100u) * Mi;
uint maxAllocSize = flagIsDefault(GPU_SINGLE_ALLOC_PERCENT) ? 25 : GPU_SINGLE_ALLOC_PERCENT;
info_.maxMemAllocSize_ = cl_ulong(info_.globalMemSize_ *
std::min(maxAllocSize, 100u) / 100u);
}
if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) {
LogError("We are unable to get a heap large enough to support the OpenCL minimum "\
"requirement for FULL_PROFILE");
}
info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_);
// Clamp max single alloc size to the globalMemSize since it's
// reduced by default
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_);
// We need to verify that we are not reporting more global memory
// that 4x single alloc
info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_);
// Use 64 bit pointers
if (settings().use64BitPtr_) {
info_.addressBits_ = 64;
}
else {
info_.addressBits_ = 32;
// Limit total size with 3GB for 32 bit
info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi));
}
// Alignment in BITS of the base address of any allocated memory object
static const size_t MemBaseAlignment = 256;
//! @note Force 256 bytes alignment, since currently
//! calAttr.surface_alignment returns 4KB. For pinned memory runtime
//! should be able to create a view with 256 bytes alignement
info_.memBaseAddrAlign_ = 8 * MemBaseAlignment;
info_.maxConstantBufferSize_ = 64 * Ki;
info_.maxConstantArgs_ = MaxConstArguments;
// Image support fields
if (settings().imageSupport_) {
info_.imageSupport_ = CL_TRUE;
info_.maxSamplers_ = MaxSamplers;
info_.maxReadImageArgs_ = MaxReadImage;
info_.maxWriteImageArgs_ = MaxWriteImage;
info_.image2DMaxWidth_ = static_cast<size_t>(getMaxTextureSize());
info_.image2DMaxHeight_ = static_cast<size_t>(getMaxTextureSize());
info_.image3DMaxWidth_ = std::min(2 * Ki, static_cast<size_t>(getMaxTextureSize()));
info_.image3DMaxHeight_ = std::min(2 * Ki, static_cast<size_t>(getMaxTextureSize()));
info_.image3DMaxDepth_ = std::min(2 * Ki, static_cast<size_t>(getMaxTextureSize()));
info_.imagePitchAlignment_ = 256; // XXX: 256 pixel pitch alignment for now
info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now
info_.bufferFromImageSupport_ = (heap()->isVirtual()) ? CL_TRUE : CL_FALSE;
}
info_.errorCorrectionSupport_ = CL_FALSE;
if (settings().apuSystem_) {
info_.hostUnifiedMemory_ = CL_TRUE;
}
info_.profilingTimerResolution_ = 1;
info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos();
info_.littleEndian_ = CL_TRUE;
info_.available_ = CL_TRUE;
info_.compilerAvailable_ = CL_TRUE;
info_.linkerAvailable_ = CL_TRUE;
info_.executionCapabilities_ = CL_EXEC_KERNEL;
info_.preferredPlatformAtomicAlignment_ = 0;
info_.preferredGlobalAtomicAlignment_ = 0;
info_.preferredLocalAtomicAlignment_ = 0;
info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE;
info_.platform_ = AMD_PLATFORM;
#if cl_amd_open_video
// Open Video support
// Decoder
info_.openVideo_ = settings().openVideo_;
info_.maxVideoSessions_ = calVideoAttr.max_decode_sessions;
info_.numVideoAttribs_ = (calVideoAttr.data_size - 2 * sizeof(CALuint))
/ sizeof(CALvideoAttrib);
info_.videoAttribs_ = const_cast<cl_video_attrib_amd*>(
reinterpret_cast<const cl_video_attrib_amd*>(calVideoAttr.video_attribs));
// Encoder
info_.numVideoEncAttribs_ = (calVideoAttr.data_size - 2 * sizeof(CALuint))
/ sizeof(CALvideoEncAttrib);
info_.videoEncAttribs_ = const_cast<cl_video_attrib_encode_amd*>(
reinterpret_cast<const cl_video_attrib_encode_amd*>(calVideoAttr.video_enc_attribs));
#endif // cl_amd_open_video
::strcpy(info_.name_, hwInfo()->targetName_);
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
AMD_BUILD_STRING "%s", (heap()->isVirtual()) ? " (VM)": "");
info_.profile_ = "FULL_PROFILE";
if (settings().oclVersion_ == OpenCL20) {
info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 2.0 ";
info_.spirVersions_ = "1.2";
}
else if (settings().oclVersion_ == OpenCL12) {
info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 1.2 ";
info_.spirVersions_ = "1.2";
}
else {
info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO;
info_.oclcVersion_ = "OpenCL C 1.0 ";
info_.spirVersions_ = "";
LogError("Unknown version for support");
}
// Fill workgroup info size
info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_;
info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_;
info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_;
info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_;
if (settings().hwLDSSize_ != 0) {
info_.localMemType_ = CL_LOCAL;
info_.localMemSize_ = settings().hwLDSSize_;
}
else {
info_.localMemType_ = CL_GLOBAL;
info_.localMemSize_ = 16 * Ki;
}
info_.extensions_ = getExtensionString();
if (settings().checkExtension(ClExtAtomicCounters32)) {
info_.maxAtomicCounters_ = MaxAtomicCounters;
}
info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation&(0xFF<<8))>>8;
info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation&(0x1F<<3))>>3;
info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation&0x07);
::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_));
// OpenCL1.2 device info fields
info_.builtInKernels_ = "";
info_.imageMaxBufferSize_ = MaxImageBufferSize;
info_.imageMaxArraySize_ = MaxImageArraySize;
info_.preferredInteropUserSync_ = true;
info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_;
if (settings().oclVersion_ >= OpenCL20) {
info_.svmCapabilities_ =
(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
if (settings().svmAtomics_) {
info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS;
}
if (settings().svmFineGrainSystem_) {
info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
}
// OpenCL2.0 device info fields
info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility
info_.maxReadWriteImageArgs_ = MaxReadWriteImage;
info_.maxPipePacketSize_ = info_.maxMemAllocSize_;
info_.maxPipeActiveReservations_ = 16;
info_.maxPipeArgs_ = 16;
info_.queueOnDeviceProperties_ =
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
info_.queueOnDevicePreferredSize_ = 256 * Ki;
info_.queueOnDeviceMaxSize_ = 512 * Ki;
info_.maxOnDeviceQueues_ = 1;
info_.maxOnDeviceEvents_ = settings().numDeviceEvents_;
info_.globalVariablePreferredTotalSize_ = static_cast<size_t>(info_.globalMemSize_);
//! \todo Remove % calculation.
//! Use 90% of max single alloc size.
//! Boards with max single alloc size around 4GB will fail allocations
info_.maxGlobalVariableSize_ = static_cast<size_t>(
amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256));
}
if (settings().checkExtension(ClAmdDeviceAttributeQuery)) {
info_.simdPerCU_ = hwInfo()->simdPerCU_;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
info_.wavefrontWidth_ = calAttr.wavefrontSize;
info_.globalMemChannels_ = calAttr.memBusWidth / 32;
info_.globalMemChannelBanks_ = calAttr.numMemBanks;
info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_;
info_.localMemBanks_ = hwInfo()->localMemBanks_;
info_.gfxipVersion_ = hwInfo()->gfxipVersion_;
info_.numAsyncQueues_ = engines().numComputeRings();
info_.threadTraceEnable_ = settings().threadTraceEnable_;
}
}
extern const char* SchedulerSourceCode;
bool
@@ -930,7 +923,9 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
}
// Fill the device info structure
fillDeviceInfo(getAttribs(), getStatus()
fillDeviceInfo(getAttribs(), getStatus(),
static_cast<size_t>(getMaxTextureSize()),
engines().numComputeRings(), heap()->isVirtual()
#if cl_amd_open_video
, getVideoAttribs()
#endif //cl_amd_open_video
+13 -10
Ver ficheiro
@@ -124,6 +124,19 @@ public:
protected:
CALtarget calTarget_; //!< GPU device identifier
const AMDDeviceInfo* hwInfo_; //!< Device HW info structure
//! Fills the OpenCL device info structure from the CAL attributes/status
//! and the explicitly supplied texture size, ring count and VM flag
void fillDeviceInfo(
const CALdeviceattribs& calAttr, //!< CAL device attributes info
const CALdevicestatus& calStatus, //!< CAL device status
size_t maxTextureSize, //!< Maximum texture size supported in HW
uint numComputeRings, //!< Number of compute rings
bool isVirtualMode //!< Device is in virtual mode
#if cl_amd_open_video
,
const CALdeviceVideoAttribs& calVideoAttr //!< CAL device video attributes info
#endif //cl_amd_open_video
);
};
//! Forward declarations
@@ -567,16 +580,6 @@ private:
//! Sends the stall command to all queues
bool stallQueues();
//! Fills the OpenCL device info structure from the CAL attributes and status
void fillDeviceInfo(
const CALdeviceattribs& calAttr, //!< CAL device attributes info
const CALdevicestatus& calStatus //!< CAL device status
#if cl_amd_open_video
,
const CALdeviceVideoAttribs& calVideoAttr //!< CAL device video attributes info
#endif //cl_amd_open_video
);
//! Buffer allocation from static heap (no VM mode only)
gpu::Memory* createBufferFromHeap(
amd::Memory& owner //!< Abstraction layer memory object