a904ee09f9
EPR #406328 - Made more changes so that OpenCL can run 2.0 on the first 2.0-capable device and OpenCL 1.2 on the rest of the devices if they can support it. Also uncommented an assert accidentally commented out by the previous check-in.
this is the OpenCL part of checkin.
Prechecking:
http://ocltc.amd.com:8111/viewModification.html?modId=40944&personal=true&buildTypeId=&tab=vcsModificationBuilds&show_all_builds=true
code review:
http://ocltc.amd.com/reviews/r/5987/
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#288 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.cpp#31 edit
[ROCm/clr commit: 4390727621]
502 righe
15 KiB
C++
502 righe
15 KiB
C++
//
|
|
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "top.hpp"
|
|
#include "os/os.hpp"
|
|
#include "device/device.hpp"
|
|
#include "device/gpu/gpudefs.hpp"
|
|
#include "device/gpu/gpusettings.hpp"
|
|
|
|
#include <algorithm>
|
|
|
|
namespace gpu {
|
|
|
|
//! Sets the conservative, ASIC-independent defaults for a GPU device.
//! Settings::create() later refines these values for the detected ASIC.
Settings::Settings()
{
    // ---- General device defaults ----
    oclVersion_ = OpenCL12;
    debugFlags_ = 0;
    singleHeap_ = false;
    syncObject_ = GPU_USE_SYNC_OBJECTS;
    remoteAlloc_ = REMOTE_ALLOC;

    // ---- Staged transfers ----
    stagedXferRead_ = true;
    stagedXferWrite_ = true;
    stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;

    // Staged read/write will be enabled if local memory is used
    disablePersistent_ = false;

    // Persistent staging writes follow the runtime flag
    // (disabled unless the flag says otherwise)
    stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT;

    // ---- Resource renaming limits ----
    maxRenames_ = 16;
    maxRenameSize_ = 4 * Mi;

    // ---- Global heap: initial size and growth step ----
    heapSize_ = GPU_INITIAL_HEAP_SIZE * Mi;
    heapSizeGrowth_ = GPU_HEAP_GROWTH_INCREMENT * Mi;

    // ---- Capabilities that stay off until create() enables them ----
    useAliases_ = false;
    imageSupport_ = false;
    hwLDSSize_ = 0;

    // Follows the CL_KHR_FP64 flag; make it unconditionally true
    // once that flag is dropped
    doublePrecision_ = ::CL_KHR_FP64;

    // Workgroup info size
    // @todo: revisit the 256 limitation on workgroup size
    maxWorkGroupSize_ = 256;

    hostMemDirectAccess_ = HostMemDisable;

    libSelector_ = amd::LibraryUndefined;

#if cl_amd_open_video
    // Open Video extension is not supported until proven otherwise
    openVideo_ = false;
#endif // cl_amd_open_video

    // Workload splitting is on by default (for 24 bit arithmetic or timeout)
    workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;

    // ---- Blit engine and pinned transfers ----
    // Host blit is the default engine
    blitEngine_ = BlitEngineHost;
    // Upper bound applied to the GPU_PINNED_XFER_SIZE flag (scaled by Mi below)
    static const size_t PinnedXferSizeCap = 32;
    pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, PinnedXferSizeCap) * Mi;
    // The minimum pinned transfer size never exceeds the pinned transfer size
    pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);

    // FP_FAST_FMA defines are not reported by default
    reportFMAF_ = false;
    reportFMA_ = false;

    // Asynchronous memory transfers are off by default
    asyncMemCopy_ = false;

    // Assume a discrete GPU (not an APU) by default
    apuSystem_ = false;

    // ---- Addressing and allocation limits ----
    // 64 bit pointers are off by default
    use64BitPtr_ = false;

    // Maximum single allocation size: 16GB
    maxAllocSize_ = 16 * static_cast<uint64_t>(Gi);

    // Memory dependency tracking is off by default
    numMemDependencies_ = 0;

    // ---- Cache description: no cache until create() reports one ----
    cacheLineSize_ = 0;
    cacheSize_ = 0;

    // Transfer buffer size defaults to 1MB
    xferBufSize_ = 1024 * Ki;

    // Image DMA is controlled by the runtime flag
    imageDMA_ = GPU_IMAGE_DMA;

    // ---- ASIC generation markers: all off by default ----
    siPlus_ = false;
    ciPlus_ = false;
    viPlus_ = false;

    // Number of compute rings
    numComputeRings_ = 0;

    // Rectangular Linear DRMDMA
    rectLinearDMA_ = false;

    // Workload time bounds
    minWorkloadTime_ = 1;       // 0.1 ms
    maxWorkloadTime_ = 5000;    // 500 ms

    // Address space preallocation is controlled by the runtime flag
    preallocAddrSpace_ = GPU_PREALLOC_ADDR_SPACE;

    // Controls tiled images in persistent
    //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
    linearPersistentImage_ = false;

    useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;

    // SDMA profiling is off by default
    sdmaProfiling_ = false;

    // ---- Device enqueuing ----
    numDeviceEvents_ = 1024;
    numWaitEvents_ = 8;

    // HSAIL is off by default
    hsail_ = false;

    // Platform atomics are not supported by default
    svmAtomics_ = false;

    // Direct SRD usage follows the runtime flag
    hsailDirectSRD_ = GPU_DIRECT_SRD;

    // Device queue usage for device enqueuing follows the runtime flag
    useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
}
|
|
|
|
bool
|
|
Settings::create(
|
|
const CALdeviceattribs& calAttr
|
|
#if cl_amd_open_video
|
|
, const CALdeviceVideoAttribs& calVideoAttr
|
|
#endif // cl_amd_open_video
|
|
, bool reportAsOCL12Device
|
|
)
|
|
{
|
|
CALuint target = calAttr.target;
|
|
uint32_t osVer = 0x0;
|
|
|
|
// Disable thread trace by default for all devices
|
|
threadTraceEnable_ = false;
|
|
|
|
// Save resource cache size
|
|
#ifdef ATI_OS_LINUX
|
|
// Due to EPR#406216, set the default value for Linux for now
|
|
resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
|
|
#else
|
|
resourceCacheSize_ = std::max((calAttr.localRAM / 8) * Mi, GPU_RESOURCE_CACHE_SIZE * Mi);
|
|
#endif
|
|
|
|
if (calAttr.doublePrecision) {
|
|
// Report FP_FAST_FMA define if double precision HW
|
|
reportFMA_ = true;
|
|
// FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
|
|
// Bonaire, Kalindi, Spectre and Spooky so disable
|
|
// FP_FMA_FMAF for those parts in switch below
|
|
reportFMAF_ = true;
|
|
}
|
|
|
|
// Update GPU specific settings and info structure if we have any
|
|
switch (target) {
|
|
case CAL_TARGET_SUMO:
|
|
case CAL_TARGET_SUPERSUMO:
|
|
case CAL_TARGET_WRESTLER:
|
|
// Treat these like Evergreen parts as far as capabilities go
|
|
// Fall through ...
|
|
case CAL_TARGET_DEVASTATOR:
|
|
case CAL_TARGET_SCRAPPER:
|
|
apuSystem_ = true;
|
|
reportFMAF_ = false;
|
|
// For the system that has APU and Win 8, the work load needs to be smaller
|
|
// This is because KMD doesn't have workaround for TDR in Win 8
|
|
// This is needed only for EG/NI because EG/NI is using graphics ring
|
|
#if defined(_WIN32)
|
|
{
|
|
OSVERSIONINFOEX versionInfo = { 0 };
|
|
versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
|
|
versionInfo.dwMajorVersion = 6;
|
|
versionInfo.dwMinorVersion = 2;
|
|
|
|
DWORDLONG conditionMask = 0;
|
|
VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
|
|
VER_SET_CONDITION(conditionMask, VER_MINORVERSION, VER_GREATER_EQUAL);
|
|
if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
|
|
maxWorkloadTime_ = 500; // 50 ms
|
|
}
|
|
}
|
|
#endif // defined(_WIN32)
|
|
// Add the caps for Trinity here ...
|
|
// Fall through ...
|
|
case CAL_TARGET_CAYMAN:
|
|
// Add the caps for Cayman here ...
|
|
case CAL_TARGET_KAUAI:
|
|
case CAL_TARGET_BARTS:
|
|
case CAL_TARGET_TURKS:
|
|
case CAL_TARGET_CAICOS:
|
|
// Treat these like Evergreen parts as far as capabilities go
|
|
// Fall through ...
|
|
case CAL_TARGET_CYPRESS:
|
|
case CAL_TARGET_JUNIPER:
|
|
case CAL_TARGET_REDWOOD:
|
|
case CAL_TARGET_CEDAR:
|
|
// UAV arena is a pre-SI specific HW feature
|
|
useAliases_ = true;
|
|
|
|
if (CAL_TARGET_CEDAR == target) {
|
|
// Workaround for SC spill bugs.
|
|
maxWorkGroupSize_ = 128;
|
|
}
|
|
// Get the link library
|
|
libSelector_ = amd::GPU_Library_Evergreen;
|
|
|
|
// Max alloc size
|
|
maxAllocSize_ = 512 * Mi;
|
|
|
|
if ((target == CAL_TARGET_CAYMAN) ||
|
|
(target == CAL_TARGET_DEVASTATOR) ||
|
|
(target == CAL_TARGET_SCRAPPER)) {
|
|
rectLinearDMA_ = true;
|
|
}
|
|
|
|
// Disable KHR_FP64 for Trinity in the mainline
|
|
if ((target == CAL_TARGET_DEVASTATOR) ||
|
|
(target == CAL_TARGET_SCRAPPER)) {
|
|
doublePrecision_ &= !IS_MAINLINE || !flagIsDefault(CL_KHR_FP64);
|
|
}
|
|
|
|
if (target == CAL_TARGET_CYPRESS) {
|
|
// Float FMA is slower than "multiply + add" because we combine
|
|
// "multiply + add" into mad. MAD is 25% faster than FMA on Cypress,
|
|
// assuming perfect VLIW packing.
|
|
reportFMAF_ = false;
|
|
}
|
|
enableExtension(ClAmdImage2dFromBufferReadOnly);
|
|
break;
|
|
case CAL_TARGET_CARRIZO:
|
|
apuSystem_ = true;
|
|
case CAL_TARGET_ICELAND:
|
|
case CAL_TARGET_TONGA:
|
|
case CAL_TARGET_BERMUDA:
|
|
case CAL_TARGET_FIJI:
|
|
// Disable tiling aperture on VI+
|
|
linearPersistentImage_ = true;
|
|
viPlus_ = true;
|
|
// Fall through to CI ...
|
|
case CAL_TARGET_KALINDI:
|
|
case CAL_TARGET_SPECTRE:
|
|
case CAL_TARGET_SPOOKY:
|
|
case CAL_TARGET_GODAVARI:
|
|
if (!viPlus_) {
|
|
// APU systems for CI
|
|
apuSystem_ = true;
|
|
}
|
|
// Fall through ...
|
|
case CAL_TARGET_BONAIRE:
|
|
case CAL_TARGET_HAWAII:
|
|
ciPlus_ = true;
|
|
sdmaProfiling_ = true;
|
|
hsail_ = GPU_HSAIL_ENABLE;
|
|
// Fall through to SI ...
|
|
case CAL_TARGET_PITCAIRN:
|
|
case CAL_TARGET_CAPEVERDE:
|
|
case CAL_TARGET_OLAND:
|
|
case CAL_TARGET_HAINAN:
|
|
reportFMAF_ = false;
|
|
if (target == CAL_TARGET_HAWAII) {
|
|
reportFMAF_ = true;
|
|
}
|
|
// Fall through ...
|
|
case CAL_TARGET_TAHITI:
|
|
siPlus_ = true;
|
|
// Cache line size is 64 bytes
|
|
cacheLineSize_ = 64;
|
|
// L1 cache size is 16KB
|
|
cacheSize_ = 16 * Ki;
|
|
|
|
if (ciPlus_) {
|
|
libSelector_ = amd::GPU_Library_CI;
|
|
#if defined(_LP64)
|
|
oclVersion_ = !reportAsOCL12Device && calAttr.isOpenCL200Device ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
|
|
#endif
|
|
if (GPU_FORCE_OCL20_32BIT) {
|
|
force32BitOcl20_ = true;
|
|
oclVersion_ = !reportAsOCL12Device && calAttr.isOpenCL200Device ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
|
|
}
|
|
if (hsail_ || (OPENCL_VERSION < 200)) {
|
|
oclVersion_ = OpenCL12;
|
|
}
|
|
numComputeRings_ = 8;
|
|
}
|
|
else {
|
|
numComputeRings_ = 2;
|
|
libSelector_ = amd::GPU_Library_SI;
|
|
}
|
|
|
|
// This needs to be cleaned once 64bit addressing is stable
|
|
if (oclVersion_ < OpenCL20) {
|
|
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
|
|
calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR;
|
|
}
|
|
else {
|
|
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|
|
|| (oclVersion_ >= OpenCL20)))) {
|
|
use64BitPtr_ = true;
|
|
}
|
|
}
|
|
|
|
if (oclVersion_ >= OpenCL20) {
|
|
supportDepthsRGB_ = true;
|
|
customSvmAllocator_ = true;
|
|
}
|
|
if (use64BitPtr_) {
|
|
if (GPU_ENABLE_LARGE_ALLOCATION) {
|
|
maxAllocSize_ = 16ULL * Gi;
|
|
}
|
|
else {
|
|
maxAllocSize_ = 4048 * Mi;
|
|
}
|
|
}
|
|
else {
|
|
maxAllocSize_ = 3ULL * Gi;
|
|
}
|
|
|
|
supportRA_ = false;
|
|
partialDispatch_ = GPU_PARTIAL_DISPATCH;
|
|
numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
|
|
|
|
//! @todo HSAIL doesn't support 64 bit atomic on 32 bit!
|
|
if (LP64_SWITCH(!hsail_, true)) {
|
|
enableExtension(ClKhrInt64BaseAtomics);
|
|
enableExtension(ClKhrInt64ExtendedAtomics);
|
|
}
|
|
enableExtension(ClKhrImage2dFromBuffer);
|
|
|
|
rectLinearDMA_ = true;
|
|
|
|
if (AMD_DEPTH_MSAA_INTEROP) {
|
|
enableExtension(ClKhrGLDepthImages);
|
|
depthMSAAInterop_ = true;
|
|
}
|
|
if (AMD_THREAD_TRACE_ENABLE) {
|
|
threadTraceEnable_ = true;
|
|
}
|
|
// Disable non-aliased(multiUAV) optimization
|
|
assumeAliases_ = true;
|
|
break;
|
|
default:
|
|
assert(0 && "Unknown ASIC type!");
|
|
return false;
|
|
}
|
|
|
|
// Enable atomics support
|
|
enableExtension(ClKhrGlobalInt32BaseAtomics);
|
|
enableExtension(ClKhrGlobalInt32ExtendedAtomics);
|
|
enableExtension(ClKhrLocalInt32BaseAtomics);
|
|
enableExtension(ClKhrLocalInt32ExtendedAtomics);
|
|
enableExtension(ClKhrByteAddressableStore);
|
|
enableExtension(ClKhrGlSharing);
|
|
enableExtension(ClKhrGlEvent);
|
|
enableExtension(ClAmdMediaOps);
|
|
enableExtension(ClAmdMediaOps2);
|
|
enableExtension(ClAmdPopcnt);
|
|
#if defined(_WIN32)
|
|
enableExtension(ClKhrD3d9Sharing);
|
|
enableExtension(ClKhrD3d10Sharing);
|
|
enableExtension(ClKhrD3d11Sharing);
|
|
#endif // _WIN32
|
|
enableExtension(ClKhr3DImageWrites);
|
|
enableExtension(ClAmdVec3);
|
|
enableExtension(ClAmdPrintf);
|
|
enableExtension(ClExtAtomicCounters32);
|
|
|
|
hwLDSSize_ = 32 * Ki;
|
|
|
|
imageSupport_ = true;
|
|
singleHeap_ = true;
|
|
|
|
// Use kernels for blit if appropriate
|
|
blitEngine_ = BlitEngineKernel;
|
|
|
|
#if cl_amd_open_video
|
|
// Enable OpenVideo by default if CAL supports it
|
|
if (calVideoAttr.max_decode_sessions > 0) {
|
|
openVideo_ = true;
|
|
if (GPU_OPEN_VIDEO)
|
|
enableExtension(ClAmdOpenVideo);
|
|
}
|
|
#endif // cl_amd_open_video
|
|
|
|
hostMemDirectAccess_ |= HostMemBuffer;
|
|
// HW doesn't support untiled image writes
|
|
// hostMemDirectAccess_ |= HostMemImage;
|
|
|
|
asyncMemCopy_ = true;
|
|
|
|
// Make sure device actually supports double precision
|
|
doublePrecision_ = (calAttr.doublePrecision) ? doublePrecision_ : false;
|
|
if (doublePrecision_) {
|
|
// Enable KHR double precision extension
|
|
enableExtension(ClKhrFp64);
|
|
}
|
|
|
|
if (calAttr.doublePrecision) {
|
|
// Enable AMD double precision extension
|
|
doublePrecision_ = true;
|
|
enableExtension(ClAmdFp64);
|
|
}
|
|
|
|
if (calAttr.totalSDIHeap > 0) {
|
|
//Enable bus addressable memory extension
|
|
enableExtension(ClAMDBusAddressableMemory);
|
|
}
|
|
|
|
if (calAttr.longIdleDetect) {
|
|
// KMD is unable to detect if we map the visible memory for CPU access, so
|
|
// accessing persistent staged buffer may fail if LongIdleDetct is enabled.
|
|
disablePersistent_ = true;
|
|
}
|
|
|
|
if (calAttr.priSupport) {
|
|
svmAtomics_ = true;
|
|
}
|
|
|
|
// Enable some platform extensions
|
|
enableExtension(ClAmdDeviceAttributeQuery);
|
|
|
|
enableExtension(ClKhrSpir);
|
|
|
|
// Enable some OpenCL 2.0 extensions
|
|
if (oclVersion_ >= OpenCL20) {
|
|
enableExtension(ClKhrSubGroups);
|
|
enableExtension(ClKhrDepthImages);
|
|
}
|
|
|
|
// Override current device settings
|
|
override();
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Settings::override()
|
|
{
|
|
// Limit reported workgroup size
|
|
if (GPU_MAX_WORKGROUP_SIZE != 0) {
|
|
maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
|
|
}
|
|
|
|
// Override blit engine type
|
|
if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
|
|
blitEngine_ = GPU_BLIT_ENGINE_TYPE;
|
|
}
|
|
|
|
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
|
|
debugFlags_ = DEBUG_GPU_FLAGS;
|
|
}
|
|
|
|
// Check async memory transfer
|
|
if (!flagIsDefault(GPU_ASYNC_MEM_COPY)) {
|
|
asyncMemCopy_ = GPU_ASYNC_MEM_COPY;
|
|
}
|
|
|
|
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
|
|
debugFlags_ = DEBUG_GPU_FLAGS;
|
|
}
|
|
|
|
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
|
|
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
|
|
}
|
|
|
|
if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) {
|
|
syncObject_ = GPU_USE_SYNC_OBJECTS;
|
|
}
|
|
|
|
if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
|
|
numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
|
|
}
|
|
|
|
if (!flagIsDefault(GPU_ASSUME_ALIASES)) {
|
|
assumeAliases_ = GPU_ASSUME_ALIASES;
|
|
}
|
|
if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
|
|
resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
|
|
}
|
|
}
|
|
|
|
} // namespace gpu
|