Files
rocm-systems/rocclr/runtime/device/pal/palsettings.cpp
T
foreman 20447c31c9 P4 to Git Change 1761135 by asalmanp@asalmanp-ocl-stg on 2019/03/25 14:44:32
SWDEV-132899 - [OCL][GFX10] add support for Navi10_A0 (gfx1010) (new entry from PAL for Navi10 A0 boards)

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/17022/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#50 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#74 edit
2019-03-25 22:33:07 -04:00

526 строки
15 KiB
C++

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "top.hpp"
#include "os/os.hpp"
#include "device/device.hpp"
#include "device/pal/paldefs.hpp"
#include "device/pal/palsettings.hpp"
#include <algorithm>
#if defined(_WIN32)
#include "Windows.h"
#include "VersionHelpers.h"
#endif
namespace pal {
/*! \brief Information for adjusting the maximum workload time
*
* Settings::create() fills this structure for selected ASICs and, on Windows
* builds, uses it to replace maxWorkloadTime_ when the running OS matches
* major version 6 and the minor version stored here (minor version 1 is
* annotated as Windows 7 at the assignment sites). A \c time of 0 means that
* no adjustment is requested.
*/
struct ModifyMaxWorkload {
uint32_t time; //!< Replacement max workload time. NOTE(review): use sites annotate 250000 as 250 ms, which suggests microseconds rather than "10x ms" - confirm
uint32_t minorVersion; //!< OS minor version to compare against
#if defined(_WIN32)
BYTE comparisonOps; //!< Comparison operator handed to VER_SET_CONDITION (e.g. VER_EQUAL)
#endif
};
/*! \brief Sets every PAL device setting to its conservative default
 *
 * Only compile-time constants and runtime flags are consulted here. The
 * ASIC specific values are filled in later by Settings::create().
 */
Settings::Settings() {
  // ---- General device defaults -------------------------------------------
  oclVersion_ = OpenCL12;
  debugFlags_ = 0;
  remoteAlloc_ = REMOTE_ALLOC;

  // ---- Staged transfers ---------------------------------------------------
  stagedXferRead_ = true;
  stagedXferWrite_ = true;
  stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;

  // Staged read/write gets enabled if local memory is used
  disablePersistent_ = false;

  imageSupport_ = false;
  hwLDSSize_ = 0;

  // To become unconditionally true once the CL_KHR_FP64 flag is dropped
  doublePrecision_ = ::CL_KHR_FP64;

  // ---- Workgroup sizes ----------------------------------------------------
  maxWorkGroupSize_ = 1024;
  preferredWorkGroupSize_ = 256;

  hostMemDirectAccess_ = HostMemDisable;
  libSelector_ = amd::LibraryUndefined;

  // Workload split is on by default (for 24 bit arithmetic or timeout)
  workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;

  // Host blit is the default engine
  blitEngine_ = BlitEngineHost;

  // ---- Pinned transfers: the requested size is capped at 32 (scaled by Mi) -
  static const size_t kPinnedXferCap = 32;
  pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, kPinnedXferCap) * Mi;
  pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);

  // FP_FAST_FMA defines are not reported by default
  reportFMAF_ = false;
  reportFMA_ = false;

  // Assume a discrete GPU until create() detects an APU
  apuSystem_ = false;

  // 64 bit pointers are off by default
  use64BitPtr_ = false;

  // Default allocation limit: 16GB
  maxAllocSize_ = static_cast<uint64_t>(Gi) * 16;

  // Memory dependency tracking is off by default
  numMemDependencies_ = 0;

  // No cache is assumed until the ASIC is known
  cacheLineSize_ = 0;
  cacheSize_ = 0;

  // Transfer buffer defaults to 1MB
  xferBufSize_ = 1024 * Ki;

  // Image DMA follows the runtime flag
  imageDMA_ = GPU_IMAGE_DMA;

  // ---- ASIC generation markers, all cleared by default --------------------
  viPlus_ = false;
  aiPlus_ = false;
  gfx10Plus_ = false;

  // Number of compute rings
  numComputeRings_ = 0;

  // Workload time window; the values carry the upstream annotations
  minWorkloadTime_ = 1;       // annotated upstream as 0.1 ms
  maxWorkloadTime_ = 500000;  // annotated upstream as 500 ms

  // Tiled images in persistent memory
  //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
  linearPersistentImage_ = false;

  useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;

  // ---- Device enqueue -----------------------------------------------------
  numDeviceEvents_ = 1024;
  numWaitEvents_ = 8;
  numScratchWavesPerCu_ = 16;

  // Platform atomics are unsupported by default
  svmAtomics_ = false;

  // Whether a device queue is used for device enqueuing follows the flag
  useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;

  // Single precision denormals are unsupported by default
  singleFpDenorm_ = false;

  // SDMA page fault workaround is off by default
  sdamPageFaultWar_ = false;

  // ---- RGP / SQTT capture defaults ----------------------------------------
  rgpSqttDispCount_ = PAL_RGP_DISP_COUNT;
  rgpSqttWaitIdle_ = true;
  rgpSqttForceDisable_ = false;

  // ---- Sub allocation: the max size can never exceed the chunk size -------
  subAllocationMinSize_ = 4 * Ki;
  subAllocationChunkSize_ = 64 * Mi;
  const uint64_t requestedSubAllocMax = static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki;
  subAllocationMaxSize_ = std::min(requestedSubAllocMax, subAllocationChunkSize_);

  maxCmdBuffers_ = 12;
  useLightning_ = GPU_ENABLE_LC;

  // ---- gfx10 execution modes, enabled per ASIC in create() ----------------
  enableWgpMode_ = false;
  enableWave32Mode_ = false;
  hsailExplicitXnack_ = false;
  lcWavefrontSize64_ = true;
}
/*! \brief Derives the device settings from the PAL device description
 *
 * Walks the ASIC revision from the newest generation down to the oldest,
 * relying on switch fall-through so that each generation inherits the
 * settings of the older ones. Afterwards it enables the extensions, sizes
 * the resource cache and finally applies the flag overrides through
 * Settings::override().
 *
 * \param palProp              PAL device properties; revision, gfxLevel,
 *                             gpuType and gpuMemoryProperties are read
 * \param heaps                PAL heap properties, indexed by Pal::GpuHeap*;
 *                             only heapSize is read
 * \param wscaps               Workstation caps; workStationBoard gates the
 *                             large allocation limit
 * \param reportAsOCL12Device  When true the reported version stays OpenCL12
 *
 * \return false if the ASIC revision (or, for an unknown revision, the GfxIP
 *         level) isn't recognized, true otherwise
 */
bool Settings::create(const Pal::DeviceProperties& palProp,
                      const Pal::GpuMemoryHeapProperties* heaps,
                      const Pal::WorkStationCaps& wscaps, bool reportAsOCL12Device) {
  // Disable thread trace by default for all devices
  threadTraceEnable_ = false;

  bool doublePrecision = true;

  if (doublePrecision) {
    // Report FP_FAST_FMA define if double precision HW
    reportFMA_ = true;
    // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
    // Bonaire, Kalindi, Spectre and Spooky so disable
    // FP_FMA_FMAF for those parts in switch below
    reportFMAF_ = true;
  }

  // Update GPU specific settings and info structure if we have any
  ModifyMaxWorkload modifyMaxWorkload = {0};

  // APU systems
  if (palProp.gpuType == Pal::GpuType::Integrated) {
    apuSystem_ = true;
  }

  switch (palProp.revision) {
    case Pal::AsicRevision::Unknown:
      // The revision isn't known, so classify the device by its GfxIP level
      switch (palProp.gfxLevel) {
        case Pal::GfxIpLevel::GfxIp10_1:
        case Pal::GfxIpLevel::GfxIp10:
          gfx10Plus_ = true;
          // Fall through to gfx9 ...
        case Pal::GfxIpLevel::GfxIp9:
          aiPlus_ = true;
          break;
        default:
          assert(0 && "Unknown GfxIP type!");
          return false;
      }
      // Fall through to the generation chain. The gfx10 setup below is
      // skipped for an unknown gfx9 device ...
    case Pal::AsicRevision::Navi12:
    case Pal::AsicRevision::Navi14:
    case Pal::AsicRevision::Navi10:
    case Pal::AsicRevision::Navi10_A0:
    case Pal::AsicRevision::Navi10Lite:
      // An unknown revision reaches this label by fall-through even when the
      // GfxIP level is gfx9. Apply the gfx10 setup only to real Navi
      // revisions or to unknown revisions already classified as gfx10 above,
      // otherwise a gfx9 part would be switched to WGP/wave32 handling.
      if ((palProp.revision != Pal::AsicRevision::Unknown) || gfx10Plus_) {
        gfx10Plus_ = true;
        hsailExplicitXnack_ =
            static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled ||
                              palProp.gpuMemoryProperties.flags.iommuv2Support);
        enableWgpMode_ = GPU_ENABLE_WGP_MODE;
        if (useLightning_) {
          enableWave32Mode_ = true;
        }
        if (palProp.revision == Pal::AsicRevision::Navi10Lite && useLightning_) {
          enableWave32Mode_ = false;
        }
        // An explicitly set flag always wins over the per-ASIC choice
        if (!flagIsDefault(GPU_ENABLE_WAVE32_MODE)) {
          enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE;
        }
        lcWavefrontSize64_ = !enableWave32Mode_;
      }
      // Fall through to AI (gfx9) ...
    case Pal::AsicRevision::Vega20:
    case Pal::AsicRevision::Vega12:
    case Pal::AsicRevision::Vega10:
    case Pal::AsicRevision::Raven:
    case Pal::AsicRevision::Raven2:
      aiPlus_ = true;
      // Fall through to VI ...
    case Pal::AsicRevision::Carrizo:
    case Pal::AsicRevision::Bristol:
    case Pal::AsicRevision::Stoney:
      if (!aiPlus_) {
        // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10
        minWorkloadTime_ = 1000;
        modifyMaxWorkload.time = 1000;       // Decided by experiment
        modifyMaxWorkload.minorVersion = 1;  // Win 7
#if defined(_WIN32)
        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // Limit to Win 7 only
#endif
      }
      // Fall through to the remaining VI parts ...
    case Pal::AsicRevision::Iceland:
    case Pal::AsicRevision::Tonga:
    case Pal::AsicRevision::Fiji:
    case Pal::AsicRevision::Polaris10:
    case Pal::AsicRevision::Polaris11:
    case Pal::AsicRevision::Polaris12:
      // Disable tiling aperture on VI+
      linearPersistentImage_ = true;
      // Keep this false even though we have support
      // singleFpDenorm_ = true;
      viPlus_ = true;
      // SDMA may have memory access outside of
      // the valid buffer range and cause a page fault
      sdamPageFaultWar_ = true;
      enableExtension(ClKhrFp16);
      // Fall through to CI ...
    case Pal::AsicRevision::Kalindi:
    case Pal::AsicRevision::Godavari:
    case Pal::AsicRevision::Spectre:
    case Pal::AsicRevision::Spooky:
      if (!viPlus_) {
        // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
        modifyMaxWorkload.time = 250000;     // 250ms
        modifyMaxWorkload.minorVersion = 1;  // Win 7
#if defined(_WIN32)
        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // limit to Win 7
#endif
      }
      // Fall through ...
    case Pal::AsicRevision::Bonaire:
    case Pal::AsicRevision::Hawaii:
      threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
      // FP_FAST_FMAF is reported for Hawaii and gfx9+ only
      reportFMAF_ = false;
      if ((palProp.revision == Pal::AsicRevision::Hawaii) || aiPlus_) {
        reportFMAF_ = true;
      }
      // Cache line size is 64 bytes
      cacheLineSize_ = 64;
      // L1 cache size is 16KB
      cacheSize_ = 16 * Ki;

      libSelector_ = amd::GPU_Library_CI;
      if (LP64_SWITCH(false, true)) {
        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
            : OpenCL12;
      }
      if (GPU_FORCE_OCL20_32BIT) {
        force32BitOcl20_ = true;
        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
            : OpenCL12;
      }
      if (OPENCL_VERSION < 200) {
        oclVersion_ = OpenCL12;
      }
      numComputeRings_ = 8;

      // Cap at OpenCL20 for now
      if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;

      // This needs to be cleaned once 64bit addressing is stable
      if (oclVersion_ < OpenCL20) {
        use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
            ? LP64_SWITCH(false,
                          /*calAttr.isWorkstation ||*/ true)
            : GPU_FORCE_64BIT_PTR;
      } else {
        if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
          use64BitPtr_ = true;
        }
      }

      if (oclVersion_ >= OpenCL20) {
        supportDepthsRGB_ = true;
      }
      if (use64BitPtr_) {
        if (amd::IS_HIP || (GPU_ENABLE_LARGE_ALLOCATION && wscaps.workStationBoard)) {
          maxAllocSize_ = 64ULL * Gi;
        } else {
          maxAllocSize_ = 4048 * Mi;
        }
      } else {
        maxAllocSize_ = 3ULL * Gi;
      }

      // Note: More than 4 command buffers may cause a HW hang
      // with HWSC on pre-gfx9 devices in OCLPerfKernelArguments
      if (!aiPlus_) {
        maxCmdBuffers_ = 4;
      }

      supportRA_ = false;
      numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
      break;
    default:
      assert(0 && "Unknown ASIC type!");
      return false;
  }

#if defined(_WIN32)
  // Apply the ASIC specific max workload time only on the matching Windows
  // version (major version 6 plus the requested minor version)
  if (modifyMaxWorkload.time > 0) {
    OSVERSIONINFOEX versionInfo = {0};
    versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
    versionInfo.dwMajorVersion = 6;
    versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;

    DWORDLONG conditionMask = 0;
    VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
    VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
    if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
      maxWorkloadTime_ = modifyMaxWorkload.time;
    }
  }
#endif  // defined(_WIN32)

  // Enable atomics support
  enableExtension(ClKhrInt64BaseAtomics);
  enableExtension(ClKhrInt64ExtendedAtomics);
  enableExtension(ClKhrGlobalInt32BaseAtomics);
  enableExtension(ClKhrGlobalInt32ExtendedAtomics);
  enableExtension(ClKhrLocalInt32BaseAtomics);
  enableExtension(ClKhrLocalInt32ExtendedAtomics);
  enableExtension(ClKhrByteAddressableStore);
  enableExtension(ClKhrGlSharing);
  enableExtension(ClKhrGlEvent);
  enableExtension(ClKhr3DImageWrites);
  enableExtension(ClKhrImage2dFromBuffer);
  enableExtension(ClAmdMediaOps);
  enableExtension(ClAmdMediaOps2);

  // These extensions are only enabled when Lightning isn't used
  if (!useLightning_) {
    enableExtension(ClAmdPopcnt);
    enableExtension(ClAmdVec3);
    enableExtension(ClAmdPrintf);
    enableExtension(ClKhrSpir);
  }

  // Enable some platform extensions
  enableExtension(ClAmdDeviceAttributeQuery);

  // With ATI_OS_LINUX defined the extension requires bus addressable memory,
  // otherwise the block below runs unconditionally
#ifdef ATI_OS_LINUX
  if (palProp.gpuMemoryProperties.busAddressableMemSize > 0)
#endif
  {
    enableExtension(ClAMDLiquidFlash);
  }

  hwLDSSize_ = (IS_LINUX || gfx10Plus_) ? 64 * Ki : 32 * Ki;

  imageSupport_ = true;

  // Use kernels for blit if appropriate
  blitEngine_ = BlitEngineKernel;

  hostMemDirectAccess_ |= HostMemBuffer;
  // HW doesn't support untiled image writes
  // hostMemDirectAccess_ |= HostMemImage;

  // Make sure device actually supports double precision
  doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
  if (doublePrecision_) {
    // Enable KHR double precision extension
    enableExtension(ClKhrFp64);
  }

  if (!useLightning_ && doublePrecision) {
    // Enable AMD double precision extension
    doublePrecision_ = true;
    enableExtension(ClAmdFp64);
  }

  if (palProp.gpuMemoryProperties.busAddressableMemSize > 0) {
    // Enable bus addressable memory extension
    enableExtension(ClAMDBusAddressableMemory);
  }

  //! @todo
  /*
  if (calAttr.longIdleDetect) {
  // KMD is unable to detect if we map the visible memory for CPU access, so
  // accessing persistent staged buffer may fail if LongIdleDetct is enabled.
  disablePersistent_ = true;
  }
  */

  svmFineGrainSystem_ = palProp.gpuMemoryProperties.flags.iommuv2Support;
  svmAtomics_ = svmFineGrainSystem_;

  // SVM is not currently supported for DX Interop
#if defined(_WIN32)
  enableExtension(ClKhrD3d9Sharing);
  enableExtension(ClKhrD3d10Sharing);
  enableExtension(ClKhrD3d11Sharing);
#endif  // _WIN32

  // Enable some OpenCL 2.0 extensions
  if (oclVersion_ >= OpenCL20) {
    enableExtension(ClKhrGLDepthImages);
    enableExtension(ClKhrSubGroups);
    enableExtension(ClKhrDepthImages);

    if (GPU_MIPMAP) {
      enableExtension(ClKhrMipMapImage);
      enableExtension(ClKhrMipMapImageWrites);
    }

    // Enable HW debug
    if (GPU_ENABLE_HW_DEBUG) {
      enableHwDebug_ = true;
    }

#if defined(_WIN32)
    enableExtension(ClAmdPlanarYuv);
#endif
  }

  // APUs with less than 150MB of local + invisible heap use remote allocations
  if (apuSystem_ &&
      ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) <
       (150 * Mi))) {
    remoteAlloc_ = true;
  }

  // Update resource cache size: 1/8 of the backing heap(s), but never less
  // than the size requested through GPU_RESOURCE_CACHE_SIZE
  if (remoteAlloc_) {
    resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8),
                                  static_cast<uint64_t>(GPU_RESOURCE_CACHE_SIZE) * Mi);
  } else {
    resourceCacheSize_ = std::max(
        ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) / 8),
        static_cast<uint64_t>(GPU_RESOURCE_CACHE_SIZE) * Mi);
#if !defined(_LP64)
    resourceCacheSize_ = std::min(resourceCacheSize_, 1 * Gi);
#endif
  }

  // With Lightning, single precision denormals are enabled on gfx9 and gfx10
  if (useLightning_) {
    switch (palProp.gfxLevel) {
      case Pal::GfxIpLevel::GfxIp10_1:
      case Pal::GfxIpLevel::GfxIp10:
      case Pal::GfxIpLevel::GfxIp9:
        singleFpDenorm_ = true;
        break;
      default:
        break;
    }
  }

  // Override current device settings
  override();

  return true;
}
/*! \brief Applies the runtime flag overrides on top of the current settings
 *
 * Each setting is replaced only when the corresponding flag was changed
 * from its default (or, for the workgroup size and the blit engine, when
 * the flag holds a non-neutral value). Called at the end of
 * Settings::create().
 */
void Settings::override() {
  // A non-zero GPU_MAX_WORKGROUP_SIZE replaces the preferred workgroup size
  if (GPU_MAX_WORKGROUP_SIZE != 0) {
    preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
  }

  // Override blit engine type
  if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
    blitEngine_ = GPU_BLIT_ENGINE_TYPE;
  }

  if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
    debugFlags_ = DEBUG_GPU_FLAGS;
  }

  if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
    xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
  }

  if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
    numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
  }

  if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
    // Widen before scaling by Mi, the same way Settings::create() does, so
    // the product can't wrap on builds with a 32 bit size_t
    resourceCacheSize_ = static_cast<uint64_t>(GPU_RESOURCE_CACHE_SIZE) * Mi;
  }

  // Only the values 0 and 1 force the denormal setting; any other value
  // leaves the per-ASIC choice untouched
  if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
    switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
      case 0:
        singleFpDenorm_ = false;
        break;
      case 1:
        singleFpDenorm_ = true;
        break;
      default:
        break;
    }
  }

  if (!flagIsDefault(GPU_MAX_COMMAND_BUFFERS)) {
    maxCmdBuffers_ = GPU_MAX_COMMAND_BUFFERS;
  }
}
} // namespace pal