From ffcdad85e522fc3f8df820165be5e2bbd48d6614 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 19 May 2016 18:59:20 -0400
Subject: [PATCH] P4 to Git Change 1271191 by gandryey@gera-w8 on 2016/05/19
18:42:50
SWDEV-86035 - Add PAL backend to OpenCL
- Adds SDMA workaround for page fault
- Removes directSRD and hsail flags
- Fixes a BSOD with the latest PAL on Fiji. KMD expects a valid UMD client
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#315 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#225 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#345 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#96 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/os/win/winPlatform.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#254 edit
---
rocclr/runtime/device/gpu/gpukernel.cpp | 49 +++++-------
rocclr/runtime/device/gpu/gpuprogram.cpp | 1 -
rocclr/runtime/device/gpu/gpusettings.cpp | 3 -
rocclr/runtime/device/gpu/gpusettings.hpp | 3 +-
rocclr/runtime/device/pal/paldevice.cpp | 93 ++++++++++-------------
rocclr/runtime/device/pal/palkernel.cpp | 49 +++++-------
rocclr/runtime/device/pal/palprogram.cpp | 1 -
rocclr/runtime/device/pal/palresource.cpp | 8 +-
rocclr/runtime/device/pal/palsettings.cpp | 18 ++---
rocclr/runtime/device/pal/palsettings.hpp | 49 ++++++------
rocclr/runtime/utils/flags.hpp | 2 -
11 files changed, 114 insertions(+), 162 deletions(-)
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 50f82b4161..4aa8b539ae 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3889,30 +3889,23 @@ HSAILKernel::loadArguments(
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
- if (dev().settings().hsailDirectSRD_) {
- // Image arguments are of size 48 bytes and aligned to 16 bytes
- WriteAqlArg(&aqlArgBuf, image->hwState(),
- HsaImageObjectSize, HsaImageObjectAlignment);
+ //! \note Special case for the image views.
+ //! Copy SRD to CB1, so blit manager will be able to release
+ //! this view without a wait for SRD resource.
+ if (image->memoryType() == Resource::ImageView) {
+ // Copy the current structre into CB1
+ memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
+ ConstBuffer* cb = gpu.constBufs_[1];
+ cb->uploadDataToHw(HsaImageObjectSize);
+ // Then use a pointer in aqlArgBuffer to CB1
+ uint64_t srd = cb->vmAddress() + cb->wrtOffset();
+ WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+ memList.push_back(cb);
}
else {
- //! \note Special case for the image views.
- //! Copy SRD to CB1, so blit manager will be able to release
- //! this view without a wait for SRD resource.
- if (image->memoryType() == Resource::ImageView) {
- // Copy the current structre into CB1
- memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
- ConstBuffer* cb = gpu.constBufs_[1];
- cb->uploadDataToHw(HsaImageObjectSize);
- // Then use a pointer in aqlArgBuffer to CB1
- uint64_t srd = cb->vmAddress() + cb->wrtOffset();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
- memList.push_back(cb);
- }
- else {
- uint64_t srd = image->hwSrd();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
- srdResource = true;
- }
+ uint64_t srd = image->hwSrd();
+ WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+ srdResource = true;
}
//! @todo Compiler has to return read/write attributes
@@ -3929,15 +3922,9 @@ HSAILKernel::loadArguments(
*reinterpret_cast(paramaddr);
const Sampler* gpuSampler = static_cast
(sampler->getDeviceSampler(dev()));
- if (dev().settings().hsailDirectSRD_) {
- WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
- HsaSamplerObjectSize, HsaSamplerObjectAlignment);
- }
- else {
- uint64_t srd = gpuSampler->hwSrd();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
- srdResource = true;
- }
+ uint64_t srd = gpuSampler->hwSrd();
+ WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+ srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp
index c9a18defb1..2073b339a8 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -2494,7 +2494,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
- assert(!program_->dev().settings().hsailDirectSRD_);
gpu::Sampler* sampler = new gpu::Sampler(program_->dev());
if (!sampler || !sampler->create(state)) {
delete sampler;
diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp
index b9f7f7c628..4084e2df9f 100644
--- a/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -135,9 +135,6 @@ Settings::Settings()
// Don't support platform atomics by default.
svmAtomics_ = false;
- // Use direct SRD by default
- hsailDirectSRD_ = GPU_DIRECT_SRD;
-
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
diff --git a/rocclr/runtime/device/gpu/gpusettings.hpp b/rocclr/runtime/device/gpu/gpusettings.hpp
index 585e2711d8..e3777dc904 100644
--- a/rocclr/runtime/device/gpu/gpusettings.hpp
+++ b/rocclr/runtime/device/gpu/gpusettings.hpp
@@ -69,10 +69,9 @@ public:
uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint asyncMemCopy_: 1; //!< Use async memory transfers
- uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
uint useDeviceQueue_: 1; //!< Submit to separate device queue
uint singleFpDenorm_: 1; //!< Support Single FP Denorm
- uint reserved_: 3;
+ uint reserved_: 4;
};
uint value_;
};
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 90c81e2a6a..faa245d9f4 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -149,19 +149,14 @@ NullDevice::create(Pal::GfxIpLevel ipLevel)
device::Program*
NullDevice::createProgram(amd::option::Options* options)
{
- device::Program* nullProgram;
- if (settings().hsail_) {
- nullProgram = new HSAILProgram(*this);
- }
- else {
- // AMDIL path
- ShouldNotReachHere();
- }
- if (nullProgram == nullptr) {
+ device::Program* program;
+ program = new HSAILProgram(*this);
+
+ if (program == nullptr) {
LogError("Memory allocation has failed!");
}
- return nullProgram;
+ return program;
}
void NullDevice::fillDeviceInfo(
@@ -643,19 +638,26 @@ Device::create(Pal::IDevice* device)
// Update HW info for the device
hwInfo_ = &DeviceInfo[static_cast(properties().revision)];
+ // Find the number of available engines
+ numComputeEngines_ =
+ properties().engineProperties[Pal::QueueTypeCompute].engineCount;
+ numDmaEngines_ =
+ properties().engineProperties[Pal::QueueTypeDma].engineCount;
+
Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings();
// Modify settings here
// palSettings ...
palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
+
// Commit the new settings for the device
result = iDev()->CommitSettingsAndInit();
if (result == Pal::Result::Success) {
Pal::DeviceFinalizeInfo finalizeInfo = {};
- // Request 2 compute engines
- finalizeInfo.engineCounts[Pal::QueueTypeCompute] = 2;
- // Request 2 SDMA engines
- finalizeInfo.engineCounts[Pal::QueueTypeDma] = 2;
+ // Request all compute engines
+ finalizeInfo.engineCounts[Pal::QueueTypeCompute] = numComputeEngines_;
+ // Request all SDMA engines
+ finalizeInfo.engineCounts[Pal::QueueTypeDma] = numDmaEngines_;
result = iDev()->Finalize(finalizeInfo);
}
@@ -670,12 +672,6 @@ Device::create(Pal::IDevice* device)
appProfile_.reportAsOCL12Device())) {
return false;
}
-
- // Find the number of available engines
- numComputeEngines_ =
- properties().engineProperties[Pal::QueueTypeCompute].engineCount;
- numDmaEngines_ =
- properties().engineProperties[Pal::QueueTypeDma].engineCount;
numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_);
amd::Context::Info info = {0};
@@ -816,22 +812,20 @@ Device::initializeHeapResources()
}
// Delay compilation due to brig_loader memory allocation
- if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
- const char* scheduler = nullptr;
- const char* ocl20 = nullptr;
- if (settings().oclVersion_ == OpenCL20) {
- scheduler = SchedulerSourceCode;
- ocl20 = "-cl-std=CL2.0";
- }
- blitProgram_ = new BlitProgram(context_);
- // Create blit programs
- if (blitProgram_ == nullptr ||
- !blitProgram_->create(this, scheduler, ocl20)) {
- delete blitProgram_;
- blitProgram_ = nullptr;
- LogError("Couldn't create blit kernels!");
- return false;
- }
+ const char* scheduler = nullptr;
+ const char* ocl20 = nullptr;
+ if (settings().oclVersion_ == OpenCL20) {
+ scheduler = SchedulerSourceCode;
+ ocl20 = "-cl-std=CL2.0";
+ }
+ blitProgram_ = new BlitProgram(context_);
+ // Create blit programs
+ if (blitProgram_ == nullptr ||
+ !blitProgram_->create(this, scheduler, ocl20)) {
+ delete blitProgram_;
+ blitProgram_ = nullptr;
+ LogError("Couldn't create blit kernels!");
+ return false;
}
// Create a synchronized transfer queue
@@ -900,20 +894,13 @@ Device::createVirtualDevice(
device::Program*
Device::createProgram(amd::option::Options* options)
{
- device::Program* gpuProgram;
- if (settings().hsail_) {
- gpuProgram = new HSAILProgram(*this);
- }
- else {
- ShouldNotReachHere();
- //AMDIL
- //gpuProgram = new Program(*this);
- }
- if (gpuProgram == nullptr) {
+ device::Program* program;
+ program = new HSAILProgram(*this);
+ if (program == nullptr) {
LogError("We failed memory allocation for program!");
}
- return gpuProgram;
+ return program;
}
//! Requested devices list as configured by the GPU_DEVICE_ORDINAL
@@ -1410,14 +1397,12 @@ bool
Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const
{
*sampler = nullptr;
- if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) {
- Sampler* gpuSampler = new Sampler(*this);
- if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) {
- delete gpuSampler;
- return false;
- }
- *sampler = gpuSampler;
+ Sampler* gpuSampler = new Sampler(*this);
+ if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) {
+ delete gpuSampler;
+ return false;
}
+ *sampler = gpuSampler;
return true;
}
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index ea82fd075a..5334e28931 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -1063,30 +1063,23 @@ HSAILKernel::loadArguments(
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
- if (dev().settings().hsailDirectSRD_) {
- // Image arguments are of size 48 bytes and aligned to 16 bytes
- WriteAqlArg(&aqlArgBuf, image->hwState(),
- HsaImageObjectSize, HsaImageObjectAlignment);
+ //! \note Special case for the image views.
+ //! Copy SRD to CB1, so blit manager will be able to release
+ //! this view without a wait for SRD resource.
+ if (image->memoryType() == Resource::ImageView) {
+ // Copy the current structre into CB1
+ memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
+ ConstBuffer* cb = gpu.constBufs_[1];
+ cb->uploadDataToHw(HsaImageObjectSize);
+ // Then use a pointer in aqlArgBuffer to CB1
+ uint64_t srd = cb->vmAddress() + cb->wrtOffset();
+ WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+ memList.push_back(cb);
}
else {
- //! \note Special case for the image views.
- //! Copy SRD to CB1, so blit manager will be able to release
- //! this view without a wait for SRD resource.
- if (image->memoryType() == Resource::ImageView) {
- // Copy the current structre into CB1
- memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
- ConstBuffer* cb = gpu.constBufs_[1];
- cb->uploadDataToHw(HsaImageObjectSize);
- // Then use a pointer in aqlArgBuffer to CB1
- uint64_t srd = cb->vmAddress() + cb->wrtOffset();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
- memList.push_back(cb);
- }
- else {
- uint64_t srd = image->hwSrd();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
- srdResource = true;
- }
+ uint64_t srd = image->hwSrd();
+ WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+ srdResource = true;
}
//! @todo Compiler has to return read/write attributes
@@ -1103,15 +1096,9 @@ HSAILKernel::loadArguments(
*reinterpret_cast(paramaddr);
const Sampler* gpuSampler = static_cast
(sampler->getDeviceSampler(dev()));
- if (dev().settings().hsailDirectSRD_) {
- WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
- HsaSamplerObjectSize, HsaSamplerObjectAlignment);
- }
- else {
- uint64_t srd = gpuSampler->hwSrd();
- WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
- srdResource = true;
- }
+ uint64_t srd = gpuSampler->hwSrd();
+ WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+ srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index d677959ff9..1535faf103 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -844,7 +844,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
- assert(!program_->dev().settings().hsailDirectSRD_);
pal::Sampler* sampler = new pal::Sampler(program_->dev());
if (!sampler || !sampler->create(state)) {
delete sampler;
diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp
index f131de6adf..487ea8be41 100644
--- a/rocclr/runtime/device/pal/palresource.cpp
+++ b/rocclr/runtime/device/pal/palresource.cpp
@@ -1178,7 +1178,9 @@ Resource::partialMemCopyTo(
// Make sure linear pitch in bytes is 4 bytes aligned
if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
// another DRM restriciton... SI has 4 pixels
- (copyRegion.gpuMemoryOffset % 4 != 0)) {
+ (copyRegion.gpuMemoryOffset % 4 != 0) ||
+ (dev().settings().sdamPageFaultWar_ &&
+ (copyRegion.imageOffset.x % dstResource.elementSize() != 0))) {
result = false;
}
else {
@@ -1204,7 +1206,9 @@ Resource::partialMemCopyTo(
// Make sure linear pitch in bytes is 4 bytes aligned
if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
// another DRM restriciton... SI has 4 pixels
- (copyRegion.gpuMemoryOffset % 4 != 0)) {
+ (copyRegion.gpuMemoryOffset % 4 != 0) ||
+ (dev().settings().sdamPageFaultWar_ &&
+ (copyRegion.imageOffset.x % elementSize() != 0))) {
result = false;
}
else {
diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp
index 2a239d03e3..86c5542fd4 100644
--- a/rocclr/runtime/device/pal/palsettings.cpp
+++ b/rocclr/runtime/device/pal/palsettings.cpp
@@ -115,20 +115,17 @@ Settings::Settings()
numDeviceEvents_ = 1024;
numWaitEvents_ = 8;
- // Disable HSAIL by default
- hsail_ = false;
-
// Don't support platform atomics by default.
svmAtomics_ = false;
- // Use direct SRD by default
- hsailDirectSRD_ = GPU_DIRECT_SRD;
-
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
// Don't support Denormals for single precision by default
singleFpDenorm_ = false;
+
+ // Disable SDMA workaround by default
+ sdamPageFaultWar_ = false;
}
bool
@@ -179,6 +176,9 @@ Settings::create(
// Keep this false even though we have support
// singleFpDenorm_ = true;
viPlus_ = true;
+ // SDMA may have memory access outside of
+ // the valid buffer range and cause a page fault
+ sdamPageFaultWar_ = true;
// Fall through to CI ...
case Pal::AsicRevision::Kalindi:
case Pal::AsicRevision::Spectre:
@@ -193,7 +193,6 @@ Settings::create(
case Pal::AsicRevision::Bonaire:
case Pal::AsicRevision::Hawaii:
ciPlus_ = true;
- hsail_ = true;
threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
reportFMAF_ = false;
if (palProp.revision == Pal::AsicRevision::Hawaii) {
@@ -228,11 +227,10 @@ Settings::create(
// This needs to be cleaned once 64bit addressing is stable
if (oclVersion_ < OpenCL20) {
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
- /*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
+ /*calAttr.isWorkstation ||*/ true) : GPU_FORCE_64BIT_PTR;
}
else {
- if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
- || (oclVersion_ >= OpenCL20)))) {
+ if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
use64BitPtr_ = true;
}
}
diff --git a/rocclr/runtime/device/pal/palsettings.hpp b/rocclr/runtime/device/pal/palsettings.hpp
index 4aea512f7c..50a27a9a43 100644
--- a/rocclr/runtime/device/pal/palsettings.hpp
+++ b/rocclr/runtime/device/pal/palsettings.hpp
@@ -50,56 +50,55 @@ public:
uint disablePersistent_: 1; //!< Disables using persistent memory for staging
uint imageSupport_: 1; //!< Report images support
uint doublePrecision_: 1; //!< Enables double precision support
- uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program
- uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program
- uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU
+ uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program
+ uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program
+ uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU
uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
- uint imageDMA_: 1; //!< Enable direct image DMA transfers
- uint syncObject_: 1; //!< Enable syncobject
- uint ciPlus_: 1; //!< CI and post CI features
- uint viPlus_: 1; //!< VI and post VI features
- uint aiPlus_: 1; //!< AI and post AI features
+ uint imageDMA_: 1; //!< Enable direct image DMA transfers
+ uint syncObject_: 1; //!< Enable syncobject
+ uint ciPlus_: 1; //!< CI and post CI features
+ uint viPlus_: 1; //!< VI and post VI features
+ uint aiPlus_: 1; //!< AI and post AI features
uint threadTraceEnable_: 1; //!< Thread trace enable
uint linearPersistentImage_: 1; //!< Allocates linear images in persistent
uint useSingleScratch_: 1; //!< Allocates single scratch per device
- uint hsail_: 1; //!< Enables HSAIL compilation
uint stagingWritePersistent_: 1; //!< Enables persistent writes
- uint svmAtomics_: 1; //!< SVM device atomics
- uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
- uint apuSystem_: 1; //!< Device is APU system with shared memory
- uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
- uint useDeviceQueue_: 1; //!< Submit to separate device queue
- uint singleFpDenorm_: 1; //!< Support Single FP Denorm
- uint reserved_: 5;
+ uint svmAtomics_: 1; //!< SVM device atomics
+ uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
+ uint apuSystem_: 1; //!< Device is APU system with shared memory
+ uint useDeviceQueue_: 1; //!< Submit to separate device queue
+ uint singleFpDenorm_: 1; //!< Support Single FP Denorm
+ uint sdamPageFaultWar_: 1; //!< SDAM page fault workaround
+ uint reserved_: 7;
};
uint value_;
};
uint oclVersion_; //!< Reported OpenCL version support
uint debugFlags_; //!< Debug GPU flags
- size_t stagedXferSize_; //!< Staged buffer size
uint maxRenames_; //!< Maximum number of possible renames
uint maxRenameSize_; //!< Maximum size for all renames
uint hwLDSSize_; //!< HW local data store size
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
- uint hostMemDirectAccess_; //!< Enables direct access to the host memory
- amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
uint workloadSplitSize_; //!< Workload split size
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
uint blitEngine_; //!< Blit engine type
- size_t pinnedXferSize_; //!< Pinned buffer size for transfer
- size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
- size_t resourceCacheSize_; //!< Resource cache size in MB
- uint64_t maxAllocSize_; //!< Maximum single allocation size
- size_t numMemDependencies_;//!< The array size for memory dependencies tracking
uint cacheLineSize_; //!< Cache line size in bytes
uint cacheSize_; //!< L1 cache size in bytes
- size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
uint numDeviceEvents_; //!< The number of device events
uint numWaitEvents_; //!< The number of wait events for device enqueue
+ uint hostMemDirectAccess_; //!< Enables direct access to the host memory
+ size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
+ size_t stagedXferSize_; //!< Staged buffer size
+ size_t pinnedXferSize_; //!< Pinned buffer size for transfer
+ size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+ size_t resourceCacheSize_; //!< Resource cache size in MB
+ size_t numMemDependencies_;//!< The array size for memory dependencies tracking
+ uint64_t maxAllocSize_; //!< Maximum single allocation size
+ amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
//! Default constructor
Settings();
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 674a11ac3f..cd1b77bd6a 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -139,8 +139,6 @@ release(bool, GPU_HSAIL_ENABLE, LP64_SWITCH(LINUX_SWITCH(false,true),true), \
"Enable HSAIL on dGPU stack (requires CI+ HW)") \
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
"Prints the specified number of the child kernels") \
-release(bool, GPU_DIRECT_SRD, false, \
- "Use indirect SRD access in HSAIL") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
"Use a dedicated device queue for the actual submissions") \
release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \