From ffcdad85e522fc3f8df820165be5e2bbd48d6614 Mon Sep 17 00:00:00 2001 From: foreman Date: Thu, 19 May 2016 18:59:20 -0400 Subject: [PATCH] P4 to Git Change 1271191 by gandryey@gera-w8 on 2016/05/19 18:42:50 SWDEV-86035 - Add PAL backend to OpenCL - Adds SDMA workaround for page fault - Removes directSRD and hsail flags - Fixes a BSOD with the latest PAL on Fiji. KMD expects a valid UMD client Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#315 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#225 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#345 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#96 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/os/win/winPlatform.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#3 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#254 edit --- rocclr/runtime/device/gpu/gpukernel.cpp | 49 +++++------- rocclr/runtime/device/gpu/gpuprogram.cpp | 1 - rocclr/runtime/device/gpu/gpusettings.cpp | 3 - rocclr/runtime/device/gpu/gpusettings.hpp | 3 +- rocclr/runtime/device/pal/paldevice.cpp | 93 ++++++++++------------- rocclr/runtime/device/pal/palkernel.cpp | 49 +++++------- rocclr/runtime/device/pal/palprogram.cpp | 1 - rocclr/runtime/device/pal/palresource.cpp | 8 +- rocclr/runtime/device/pal/palsettings.cpp | 18 ++--- rocclr/runtime/device/pal/palsettings.hpp | 49 ++++++------ rocclr/runtime/utils/flags.hpp | 2 - 11 files changed, 114 insertions(+), 162 deletions(-) diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index 50f82b4161..4aa8b539ae 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3889,30 +3889,23 @@ HSAILKernel::loadArguments( //! \note syncCache may call DRM transfer image->wait(gpu, WaitOnBusyEngine); - if (dev().settings().hsailDirectSRD_) { - // Image arguments are of size 48 bytes and aligned to 16 bytes - WriteAqlArg(&aqlArgBuf, image->hwState(), - HsaImageObjectSize, HsaImageObjectAlignment); + //! \note Special case for the image views. + //! Copy SRD to CB1, so blit manager will be able to release + //! this view without a wait for SRD resource. + if (image->memoryType() == Resource::ImageView) { + // Copy the current structre into CB1 + memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(HsaImageObjectSize); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t srd = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + memList.push_back(cb); } else { - //! \note Special case for the image views. - //! Copy SRD to CB1, so blit manager will be able to release - //! this view without a wait for SRD resource. 
- if (image->memoryType() == Resource::ImageView) { - // Copy the current structre into CB1 - memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(HsaImageObjectSize); - // Then use a pointer in aqlArgBuffer to CB1 - uint64_t srd = cb->vmAddress() + cb->wrtOffset(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - memList.push_back(cb); - } - else { - uint64_t srd = image->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - } + uint64_t srd = image->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; } //! @todo Compiler has to return read/write attributes @@ -3929,15 +3922,9 @@ HSAILKernel::loadArguments( *reinterpret_cast(paramaddr); const Sampler* gpuSampler = static_cast (sampler->getDeviceSampler(dev())); - if (dev().settings().hsailDirectSRD_) { - WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(), - HsaSamplerObjectSize, HsaSamplerObjectAlignment); - } - else { - uint64_t srd = gpuSampler->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - } + uint64_t srd = gpuSampler->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; break; } case HSAIL_ARGTYPE_QUEUE: { diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp index c9a18defb1..2073b339a8 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -2494,7 +2494,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate( assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - assert(!program_->dev().settings().hsailDirectSRD_); gpu::Sampler* sampler = new gpu::Sampler(program_->dev()); if (!sampler || !sampler->create(state)) { delete sampler; diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp index b9f7f7c628..4084e2df9f 100644 --- a/rocclr/runtime/device/gpu/gpusettings.cpp +++ 
b/rocclr/runtime/device/gpu/gpusettings.cpp @@ -135,9 +135,6 @@ Settings::Settings() // Don't support platform atomics by default. svmAtomics_ = false; - // Use direct SRD by default - hsailDirectSRD_ = GPU_DIRECT_SRD; - // Use host queue for device enqueuing by default useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; diff --git a/rocclr/runtime/device/gpu/gpusettings.hpp b/rocclr/runtime/device/gpu/gpusettings.hpp index 585e2711d8..e3777dc904 100644 --- a/rocclr/runtime/device/gpu/gpusettings.hpp +++ b/rocclr/runtime/device/gpu/gpusettings.hpp @@ -69,10 +69,9 @@ public: uint svmFineGrainSystem_: 1; //!< SVM fine grain system support uint apuSystem_: 1; //!< Device is APU system with shared memory uint asyncMemCopy_: 1; //!< Use async memory transfers - uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL uint useDeviceQueue_: 1; //!< Submit to separate device queue uint singleFpDenorm_: 1; //!< Support Single FP Denorm - uint reserved_: 3; + uint reserved_: 4; }; uint value_; }; diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 90c81e2a6a..faa245d9f4 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -149,19 +149,14 @@ NullDevice::create(Pal::GfxIpLevel ipLevel) device::Program* NullDevice::createProgram(amd::option::Options* options) { - device::Program* nullProgram; - if (settings().hsail_) { - nullProgram = new HSAILProgram(*this); - } - else { - // AMDIL path - ShouldNotReachHere(); - } - if (nullProgram == nullptr) { + device::Program* program; + program = new HSAILProgram(*this); + + if (program == nullptr) { LogError("Memory allocation has failed!"); } - return nullProgram; + return program; } void NullDevice::fillDeviceInfo( @@ -643,19 +638,26 @@ Device::create(Pal::IDevice* device) // Update HW info for the device hwInfo_ = &DeviceInfo[static_cast(properties().revision)]; + // Find the number of available engines + numComputeEngines_ = + 
properties().engineProperties[Pal::QueueTypeCompute].engineCount; + numDmaEngines_ = + properties().engineProperties[Pal::QueueTypeDma].engineCount; + Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings(); // Modify settings here // palSettings ... palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; + // Commit the new settings for the device result = iDev()->CommitSettingsAndInit(); if (result == Pal::Result::Success) { Pal::DeviceFinalizeInfo finalizeInfo = {}; - // Request 2 compute engines - finalizeInfo.engineCounts[Pal::QueueTypeCompute] = 2; - // Request 2 SDMA engines - finalizeInfo.engineCounts[Pal::QueueTypeDma] = 2; + // Request all compute engines + finalizeInfo.engineCounts[Pal::QueueTypeCompute] = numComputeEngines_; + // Request all SDMA engines + finalizeInfo.engineCounts[Pal::QueueTypeDma] = numDmaEngines_; result = iDev()->Finalize(finalizeInfo); } @@ -670,12 +672,6 @@ Device::create(Pal::IDevice* device) appProfile_.reportAsOCL12Device())) { return false; } - - // Find the number of available engines - numComputeEngines_ = - properties().engineProperties[Pal::QueueTypeCompute].engineCount; - numDmaEngines_ = - properties().engineProperties[Pal::QueueTypeDma].engineCount; numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_); amd::Context::Info info = {0}; @@ -816,22 +812,20 @@ Device::initializeHeapResources() } // Delay compilation due to brig_loader memory allocation - if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { - const char* scheduler = nullptr; - const char* ocl20 = nullptr; - if (settings().oclVersion_ == OpenCL20) { - scheduler = SchedulerSourceCode; - ocl20 = "-cl-std=CL2.0"; - } - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == nullptr || - !blitProgram_->create(this, scheduler, ocl20)) { - delete blitProgram_; - blitProgram_ = nullptr; - LogError("Couldn't create blit kernels!"); - return false; - } + const char* 
scheduler = nullptr; + const char* ocl20 = nullptr; + if (settings().oclVersion_ == OpenCL20) { + scheduler = SchedulerSourceCode; + ocl20 = "-cl-std=CL2.0"; + } + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == nullptr || + !blitProgram_->create(this, scheduler, ocl20)) { + delete blitProgram_; + blitProgram_ = nullptr; + LogError("Couldn't create blit kernels!"); + return false; } // Create a synchronized transfer queue @@ -900,20 +894,13 @@ Device::createVirtualDevice( device::Program* Device::createProgram(amd::option::Options* options) { - device::Program* gpuProgram; - if (settings().hsail_) { - gpuProgram = new HSAILProgram(*this); - } - else { - ShouldNotReachHere(); - //AMDIL - //gpuProgram = new Program(*this); - } - if (gpuProgram == nullptr) { + device::Program* program; + program = new HSAILProgram(*this); + if (program == nullptr) { LogError("We failed memory allocation for program!"); } - return gpuProgram; + return program; } //! Requested devices list as configured by the GPU_DEVICE_ORDINAL @@ -1410,14 +1397,12 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const { *sampler = nullptr; - if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) { - Sampler* gpuSampler = new Sampler(*this); - if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) { - delete gpuSampler; - return false; - } - *sampler = gpuSampler; + Sampler* gpuSampler = new Sampler(*this); + if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) { + delete gpuSampler; + return false; } + *sampler = gpuSampler; return true; } diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index ea82fd075a..5334e28931 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -1063,30 +1063,23 @@ HSAILKernel::loadArguments( //! 
\note syncCache may call DRM transfer image->wait(gpu, WaitOnBusyEngine); - if (dev().settings().hsailDirectSRD_) { - // Image arguments are of size 48 bytes and aligned to 16 bytes - WriteAqlArg(&aqlArgBuf, image->hwState(), - HsaImageObjectSize, HsaImageObjectAlignment); + //! \note Special case for the image views. + //! Copy SRD to CB1, so blit manager will be able to release + //! this view without a wait for SRD resource. + if (image->memoryType() == Resource::ImageView) { + // Copy the current structre into CB1 + memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(HsaImageObjectSize); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t srd = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + memList.push_back(cb); } else { - //! \note Special case for the image views. - //! Copy SRD to CB1, so blit manager will be able to release - //! this view without a wait for SRD resource. - if (image->memoryType() == Resource::ImageView) { - // Copy the current structre into CB1 - memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(HsaImageObjectSize); - // Then use a pointer in aqlArgBuffer to CB1 - uint64_t srd = cb->vmAddress() + cb->wrtOffset(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - memList.push_back(cb); - } - else { - uint64_t srd = image->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - } + uint64_t srd = image->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; } //! 
@todo Compiler has to return read/write attributes @@ -1103,15 +1096,9 @@ HSAILKernel::loadArguments( *reinterpret_cast(paramaddr); const Sampler* gpuSampler = static_cast (sampler->getDeviceSampler(dev())); - if (dev().settings().hsailDirectSRD_) { - WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(), - HsaSamplerObjectSize, HsaSamplerObjectAlignment); - } - else { - uint64_t srd = gpuSampler->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - } + uint64_t srd = gpuSampler->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; break; } case HSAIL_ARGTYPE_QUEUE: { diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp index d677959ff9..1535faf103 100644 --- a/rocclr/runtime/device/pal/palprogram.cpp +++ b/rocclr/runtime/device/pal/palprogram.cpp @@ -844,7 +844,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate( assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - assert(!program_->dev().settings().hsailDirectSRD_); pal::Sampler* sampler = new pal::Sampler(program_->dev()); if (!sampler || !sampler->create(state)) { delete sampler; diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp index f131de6adf..487ea8be41 100644 --- a/rocclr/runtime/device/pal/palresource.cpp +++ b/rocclr/runtime/device/pal/palresource.cpp @@ -1178,7 +1178,9 @@ Resource::partialMemCopyTo( // Make sure linear pitch in bytes is 4 bytes aligned if (((copyRegion.gpuMemoryRowPitch % 4) != 0) || // another DRM restriciton... SI has 4 pixels - (copyRegion.gpuMemoryOffset % 4 != 0)) { + (copyRegion.gpuMemoryOffset % 4 != 0) || + (dev().settings().sdamPageFaultWar_ && + (copyRegion.imageOffset.x % dstResource.elementSize() != 0))) { result = false; } else { @@ -1204,7 +1206,9 @@ Resource::partialMemCopyTo( // Make sure linear pitch in bytes is 4 bytes aligned if (((copyRegion.gpuMemoryRowPitch % 4) != 0) || // another DRM restriciton... 
SI has 4 pixels - (copyRegion.gpuMemoryOffset % 4 != 0)) { + (copyRegion.gpuMemoryOffset % 4 != 0) || + (dev().settings().sdamPageFaultWar_ && + (copyRegion.imageOffset.x % elementSize() != 0))) { result = false; } else { diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp index 2a239d03e3..86c5542fd4 100644 --- a/rocclr/runtime/device/pal/palsettings.cpp +++ b/rocclr/runtime/device/pal/palsettings.cpp @@ -115,20 +115,17 @@ Settings::Settings() numDeviceEvents_ = 1024; numWaitEvents_ = 8; - // Disable HSAIL by default - hsail_ = false; - // Don't support platform atomics by default. svmAtomics_ = false; - // Use direct SRD by default - hsailDirectSRD_ = GPU_DIRECT_SRD; - // Use host queue for device enqueuing by default useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; // Don't support Denormals for single precision by default singleFpDenorm_ = false; + + // Disable SDMA workaround by default + sdamPageFaultWar_ = false; } bool @@ -179,6 +176,9 @@ Settings::create( // Keep this false even though we have support // singleFpDenorm_ = true; viPlus_ = true; + // SDMA may have memory access outside of + // the valid buffer range and cause a page fault + sdamPageFaultWar_ = true; // Fall through to CI ... case Pal::AsicRevision::Kalindi: case Pal::AsicRevision::Spectre: @@ -193,7 +193,6 @@ Settings::create( case Pal::AsicRevision::Bonaire: case Pal::AsicRevision::Hawaii: ciPlus_ = true; - hsail_ = true; threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE; reportFMAF_ = false; if (palProp.revision == Pal::AsicRevision::Hawaii) { @@ -228,11 +227,10 @@ Settings::create( // This needs to be cleaned once 64bit addressing is stable if (oclVersion_ < OpenCL20) { use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? 
LP64_SWITCH(false, - /*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR; + /*calAttr.isWorkstation ||*/ true) : GPU_FORCE_64BIT_PTR; } else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ - || (oclVersion_ >= OpenCL20)))) { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { use64BitPtr_ = true; } } diff --git a/rocclr/runtime/device/pal/palsettings.hpp b/rocclr/runtime/device/pal/palsettings.hpp index 4aea512f7c..50a27a9a43 100644 --- a/rocclr/runtime/device/pal/palsettings.hpp +++ b/rocclr/runtime/device/pal/palsettings.hpp @@ -50,56 +50,55 @@ public: uint disablePersistent_: 1; //!< Disables using persistent memory for staging uint imageSupport_: 1; //!< Report images support uint doublePrecision_: 1; //!< Enables double precision support - uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program - uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program - uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU + uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program + uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program + uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU - uint imageDMA_: 1; //!< Enable direct image DMA transfers - uint syncObject_: 1; //!< Enable syncobject - uint ciPlus_: 1; //!< CI and post CI features - uint viPlus_: 1; //!< VI and post VI features - uint aiPlus_: 1; //!< AI and post AI features + uint imageDMA_: 1; //!< Enable direct image DMA transfers + uint syncObject_: 1; //!< Enable syncobject + uint ciPlus_: 1; //!< CI and post CI features + uint viPlus_: 1; //!< VI and post VI features + uint aiPlus_: 1; //!< AI and post AI features uint threadTraceEnable_: 1; //!< Thread trace enable uint linearPersistentImage_: 1; //!< Allocates linear images in persistent uint useSingleScratch_: 1; //!< Allocates single scratch per device - uint hsail_: 1; //!< Enables HSAIL compilation uint 
stagingWritePersistent_: 1; //!< Enables persistent writes - uint svmAtomics_: 1; //!< SVM device atomics - uint svmFineGrainSystem_: 1; //!< SVM fine grain system support - uint apuSystem_: 1; //!< Device is APU system with shared memory - uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL - uint useDeviceQueue_: 1; //!< Submit to separate device queue - uint singleFpDenorm_: 1; //!< Support Single FP Denorm - uint reserved_: 5; + uint svmAtomics_: 1; //!< SVM device atomics + uint svmFineGrainSystem_: 1; //!< SVM fine grain system support + uint apuSystem_: 1; //!< Device is APU system with shared memory + uint useDeviceQueue_: 1; //!< Submit to separate device queue + uint singleFpDenorm_: 1; //!< Support Single FP Denorm + uint sdamPageFaultWar_: 1; //!< SDAM page fault workaround + uint reserved_: 7; }; uint value_; }; uint oclVersion_; //!< Reported OpenCL version support uint debugFlags_; //!< Debug GPU flags - size_t stagedXferSize_; //!< Staged buffer size uint maxRenames_; //!< Maximum number of possible renames uint maxRenameSize_; //!< Maximum size for all renames uint hwLDSSize_; //!< HW local data store size uint maxWorkGroupSize_; //!< Requested workgroup size for this device - uint hostMemDirectAccess_; //!< Enables direct access to the host memory - amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler uint workloadSplitSize_; //!< Workload split size uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms uint blitEngine_; //!< Blit engine type - size_t pinnedXferSize_; //!< Pinned buffer size for transfer - size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer - size_t resourceCacheSize_; //!< Resource cache size in MB - uint64_t maxAllocSize_; //!< Maximum single allocation size - size_t numMemDependencies_;//!< The array size for memory dependencies tracking uint cacheLineSize_; //!< Cache line size in bytes uint cacheSize_; //!< 
L1 cache size in bytes - size_t xferBufSize_; //!< Transfer buffer size for image copy optimization uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings uint numDeviceEvents_; //!< The number of device events uint numWaitEvents_; //!< The number of wait events for device enqueue + uint hostMemDirectAccess_; //!< Enables direct access to the host memory + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + size_t stagedXferSize_; //!< Staged buffer size + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + size_t resourceCacheSize_; //!< Resource cache size in MB + size_t numMemDependencies_;//!< The array size for memory dependencies tracking + uint64_t maxAllocSize_; //!< Maximum single allocation size + amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler //! Default constructor Settings(); diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index 674a11ac3f..cd1b77bd6a 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -139,8 +139,6 @@ release(bool, GPU_HSAIL_ENABLE, LP64_SWITCH(LINUX_SWITCH(false,true),true), \ "Enable HSAIL on dGPU stack (requires CI+ HW)") \ release(uint, GPU_PRINT_CHILD_KERNEL, 0, \ "Prints the specified number of the child kernels") \ -release(bool, GPU_DIRECT_SRD, false, \ - "Use indirect SRD access in HSAIL") \ release(bool, GPU_USE_DEVICE_QUEUE, false, \ "Use a dedicated device queue for the actual submissions") \ release(bool, GPU_ENABLE_LARGE_ALLOCATION, true, \