From ffcdad85e522fc3f8df820165be5e2bbd48d6614 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 19 May 2016 18:59:20 -0400
Subject: [PATCH] P4 to Git Change 1271191 by gandryey@gera-w8 on 2016/05/19
 18:42:50

	SWDEV-86035 - Add PAL backend to OpenCL
	- Adds SDMA workaround for pagefault
	- Removes directSRD and hsail flags
	- Fixes a BSOD with the latest PAL on Fiji. KMD expects a valid UMD client

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#315 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#225 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#345 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#96 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/src/core/os/win/winPlatform.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#254 edit
---
 rocclr/runtime/device/gpu/gpukernel.cpp   | 49 +++++-------
 rocclr/runtime/device/gpu/gpuprogram.cpp  |  1 -
 rocclr/runtime/device/gpu/gpusettings.cpp |  3 -
 rocclr/runtime/device/gpu/gpusettings.hpp |  3 +-
 rocclr/runtime/device/pal/paldevice.cpp   | 93 ++++++++++-------------
 rocclr/runtime/device/pal/palkernel.cpp   | 49 +++++-------
 rocclr/runtime/device/pal/palprogram.cpp  |  1 -
 rocclr/runtime/device/pal/palresource.cpp |  8 +-
 rocclr/runtime/device/pal/palsettings.cpp | 18 ++---
 rocclr/runtime/device/pal/palsettings.hpp | 49 ++++++------
 rocclr/runtime/utils/flags.hpp            |  2 -
 11 files changed, 114 insertions(+), 162 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 50f82b4161..4aa8b539ae 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3889,30 +3889,23 @@ HSAILKernel::loadArguments(
             //! \note syncCache may call DRM transfer
             image->wait(gpu, WaitOnBusyEngine);
 
-            if (dev().settings().hsailDirectSRD_) {
-                // Image arguments are of size 48 bytes and aligned to 16 bytes
-                WriteAqlArg(&aqlArgBuf, image->hwState(),
-                    HsaImageObjectSize, HsaImageObjectAlignment);
+            //! \note Special case for the image views.
+            //! Copy SRD to CB1, so blit manager will be able to release
+            //! this view without a wait for SRD resource.
+            if (image->memoryType() == Resource::ImageView) {
+                // Copy the current structure into CB1
+                memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
+                ConstBuffer* cb = gpu.constBufs_[1];
+                cb->uploadDataToHw(HsaImageObjectSize);
+                // Then use a pointer in aqlArgBuffer to CB1
+                uint64_t srd = cb->vmAddress() + cb->wrtOffset();
+                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                memList.push_back(cb);
             }
             else {
-                //! \note Special case for the image views.
-                //! Copy SRD to CB1, so blit manager will be able to release
-                //! this view without a wait for SRD resource.
-                if (image->memoryType() == Resource::ImageView) {
-                    // Copy the current structre into CB1
-                    memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
-                    ConstBuffer* cb = gpu.constBufs_[1];
-                    cb->uploadDataToHw(HsaImageObjectSize);
-                    // Then use a pointer in aqlArgBuffer to CB1
-                    uint64_t srd = cb->vmAddress() + cb->wrtOffset();
-                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
-                    memList.push_back(cb);
-                }
-                else {
-                    uint64_t srd = image->hwSrd();
-                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
-                    srdResource = true;
-                }
+                uint64_t srd = image->hwSrd();
+                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                srdResource = true;
             }
 
             //! @todo Compiler has to return read/write attributes
@@ -3929,15 +3922,9 @@ HSAILKernel::loadArguments(
                 *reinterpret_cast<amd::Sampler* const*>(paramaddr);
             const Sampler* gpuSampler = static_cast<Sampler*>
                     (sampler->getDeviceSampler(dev()));
-            if (dev().settings().hsailDirectSRD_) {
-                WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
-                    HsaSamplerObjectSize, HsaSamplerObjectAlignment);
-            }
-            else {
-                uint64_t srd = gpuSampler->hwSrd();
-                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
-                srdResource = true;
-            }
+            uint64_t srd = gpuSampler->hwSrd();
+            WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+            srdResource = true;
             break;
         }
         case HSAIL_ARGTYPE_QUEUE: {
diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp
index c9a18defb1..2073b339a8 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -2494,7 +2494,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
             assert(false);
             return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     }
-    assert(!program_->dev().settings().hsailDirectSRD_);
     gpu::Sampler* sampler = new gpu::Sampler(program_->dev());
     if (!sampler || !sampler->create(state)) {
         delete sampler;
diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp
index b9f7f7c628..4084e2df9f 100644
--- a/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -135,9 +135,6 @@ Settings::Settings()
     // Don't support platform atomics by default.
     svmAtomics_ = false;
 
-    // Use direct SRD by default
-    hsailDirectSRD_ = GPU_DIRECT_SRD;
-
     // Use host queue for device enqueuing by default
     useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
 
diff --git a/rocclr/runtime/device/gpu/gpusettings.hpp b/rocclr/runtime/device/gpu/gpusettings.hpp
index 585e2711d8..e3777dc904 100644
--- a/rocclr/runtime/device/gpu/gpusettings.hpp
+++ b/rocclr/runtime/device/gpu/gpusettings.hpp
@@ -69,10 +69,9 @@ public:
             uint    svmFineGrainSystem_: 1;     //!< SVM fine grain system support
             uint    apuSystem_: 1;      //!< Device is APU system with shared memory
             uint    asyncMemCopy_: 1;   //!< Use async memory transfers
-            uint    hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
             uint    useDeviceQueue_: 1; //!< Submit to separate device queue
             uint    singleFpDenorm_: 1; //!< Support Single FP Denorm
-            uint    reserved_: 3;
+            uint    reserved_: 4;
         };
         uint    value_;
     };
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 90c81e2a6a..faa245d9f4 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -149,19 +149,14 @@ NullDevice::create(Pal::GfxIpLevel ipLevel)
 device::Program*
 NullDevice::createProgram(amd::option::Options* options)
 {
-    device::Program* nullProgram;
-    if (settings().hsail_) {
-        nullProgram = new HSAILProgram(*this);
-    }
-    else {
-        // AMDIL path
-        ShouldNotReachHere();
-    }
-    if (nullProgram == nullptr) {
+    device::Program* program;
+    program = new HSAILProgram(*this);
+
+    if (program == nullptr) {
         LogError("Memory allocation has failed!");
     }
 
-    return nullProgram;
+    return program;
 }
 
 void NullDevice::fillDeviceInfo(
@@ -643,19 +638,26 @@ Device::create(Pal::IDevice* device)
     // Update HW info for the device
     hwInfo_ = &DeviceInfo[static_cast<uint>(properties().revision)];
 
+    // Find the number of available engines
+    numComputeEngines_ =
+        properties().engineProperties[Pal::QueueTypeCompute].engineCount;
+    numDmaEngines_ =
+        properties().engineProperties[Pal::QueueTypeDma].engineCount;
+
     Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings();
     // Modify settings here
     // palSettings ...
     palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled;
+
     // Commit the new settings for the device
     result = iDev()->CommitSettingsAndInit();
     if (result == Pal::Result::Success) {
         Pal::DeviceFinalizeInfo finalizeInfo = {};
 
-        // Request 2 compute engines
-        finalizeInfo.engineCounts[Pal::QueueTypeCompute] = 2;
-        // Request 2 SDMA engines
-        finalizeInfo.engineCounts[Pal::QueueTypeDma] = 2;
+        // Request all compute engines
+        finalizeInfo.engineCounts[Pal::QueueTypeCompute] = numComputeEngines_;
+        // Request all SDMA engines
+        finalizeInfo.engineCounts[Pal::QueueTypeDma] = numDmaEngines_;
 
         result = iDev()->Finalize(finalizeInfo);
     }
@@ -670,12 +672,6 @@ Device::create(Pal::IDevice* device)
         appProfile_.reportAsOCL12Device())) {
         return false;
     }
-
-    // Find the number of available engines
-    numComputeEngines_ =
-        properties().engineProperties[Pal::QueueTypeCompute].engineCount;
-    numDmaEngines_ =
-        properties().engineProperties[Pal::QueueTypeDma].engineCount;
     numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_);
 
     amd::Context::Info  info = {0};
@@ -816,22 +812,20 @@ Device::initializeHeapResources()
         }
 
         // Delay compilation due to brig_loader memory allocation
-        if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
-            const char* scheduler = nullptr;
-            const char* ocl20 = nullptr;
-            if (settings().oclVersion_ == OpenCL20) {
-                scheduler = SchedulerSourceCode;
-                ocl20 = "-cl-std=CL2.0";
-            }
-            blitProgram_ = new BlitProgram(context_);
-            // Create blit programs
-            if (blitProgram_ == nullptr ||
-                !blitProgram_->create(this, scheduler, ocl20)) {
-                delete blitProgram_;
-                blitProgram_ = nullptr;
-                LogError("Couldn't create blit kernels!");
-                return false;
-            }
+        const char* scheduler = nullptr;
+        const char* ocl20 = nullptr;
+        if (settings().oclVersion_ == OpenCL20) {
+            scheduler = SchedulerSourceCode;
+            ocl20 = "-cl-std=CL2.0";
+        }
+        blitProgram_ = new BlitProgram(context_);
+        // Create blit programs
+        if (blitProgram_ == nullptr ||
+            !blitProgram_->create(this, scheduler, ocl20)) {
+            delete blitProgram_;
+            blitProgram_ = nullptr;
+            LogError("Couldn't create blit kernels!");
+            return false;
         }
 
         // Create a synchronized transfer queue
@@ -900,20 +894,13 @@ Device::createVirtualDevice(
 device::Program*
 Device::createProgram(amd::option::Options* options)
 {
-    device::Program* gpuProgram;
-    if (settings().hsail_) {
-        gpuProgram = new HSAILProgram(*this);
-    }
-    else {
-        ShouldNotReachHere();
-        //AMDIL
-        //gpuProgram = new Program(*this);
-    }
-    if (gpuProgram == nullptr) {
+    device::Program* program;
+    program = new HSAILProgram(*this);
+    if (program == nullptr) {
         LogError("We failed memory allocation for program!");
     }
 
-    return gpuProgram;
+    return program;
 }
 
 //! Requested devices list as configured by the GPU_DEVICE_ORDINAL
@@ -1410,14 +1397,12 @@ bool
 Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const
 {
     *sampler = nullptr;
-    if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) {
-        Sampler* gpuSampler = new Sampler(*this);
-        if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) {
-            delete gpuSampler;
-            return false;
-        }
-        *sampler = gpuSampler;
+    Sampler* gpuSampler = new Sampler(*this);
+    if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) {
+        delete gpuSampler;
+        return false;
     }
+    *sampler = gpuSampler;
     return true;
 }
 
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index ea82fd075a..5334e28931 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -1063,30 +1063,23 @@ HSAILKernel::loadArguments(
             //! \note syncCache may call DRM transfer
             image->wait(gpu, WaitOnBusyEngine);
 
-            if (dev().settings().hsailDirectSRD_) {
-                // Image arguments are of size 48 bytes and aligned to 16 bytes
-                WriteAqlArg(&aqlArgBuf, image->hwState(),
-                    HsaImageObjectSize, HsaImageObjectAlignment);
+            //! \note Special case for the image views.
+            //! Copy SRD to CB1, so blit manager will be able to release
+            //! this view without a wait for SRD resource.
+            if (image->memoryType() == Resource::ImageView) {
+                // Copy the current structure into CB1
+                memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
+                ConstBuffer* cb = gpu.constBufs_[1];
+                cb->uploadDataToHw(HsaImageObjectSize);
+                // Then use a pointer in aqlArgBuffer to CB1
+                uint64_t srd = cb->vmAddress() + cb->wrtOffset();
+                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                memList.push_back(cb);
             }
             else {
-                //! \note Special case for the image views.
-                //! Copy SRD to CB1, so blit manager will be able to release
-                //! this view without a wait for SRD resource.
-                if (image->memoryType() == Resource::ImageView) {
-                    // Copy the current structre into CB1
-                    memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
-                    ConstBuffer* cb = gpu.constBufs_[1];
-                    cb->uploadDataToHw(HsaImageObjectSize);
-                    // Then use a pointer in aqlArgBuffer to CB1
-                    uint64_t srd = cb->vmAddress() + cb->wrtOffset();
-                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
-                    memList.push_back(cb);
-                }
-                else {
-                    uint64_t srd = image->hwSrd();
-                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
-                    srdResource = true;
-                }
+                uint64_t srd = image->hwSrd();
+                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+                srdResource = true;
             }
 
             //! @todo Compiler has to return read/write attributes
@@ -1103,15 +1096,9 @@ HSAILKernel::loadArguments(
                 *reinterpret_cast<amd::Sampler* const*>(paramaddr);
             const Sampler* gpuSampler = static_cast<Sampler*>
                     (sampler->getDeviceSampler(dev()));
-            if (dev().settings().hsailDirectSRD_) {
-                WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
-                    HsaSamplerObjectSize, HsaSamplerObjectAlignment);
-            }
-            else {
-                uint64_t srd = gpuSampler->hwSrd();
-                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
-                srdResource = true;
-            }
+            uint64_t srd = gpuSampler->hwSrd();
+            WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
+            srdResource = true;
             break;
         }
         case HSAIL_ARGTYPE_QUEUE: {
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index d677959ff9..1535faf103 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -844,7 +844,6 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
             assert(false);
             return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     }
-    assert(!program_->dev().settings().hsailDirectSRD_);
     pal::Sampler* sampler = new pal::Sampler(program_->dev());
     if (!sampler || !sampler->create(state)) {
         delete sampler;
diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp
index f131de6adf..487ea8be41 100644
--- a/rocclr/runtime/device/pal/palresource.cpp
+++ b/rocclr/runtime/device/pal/palresource.cpp
@@ -1178,7 +1178,9 @@ Resource::partialMemCopyTo(
         // Make sure linear pitch in bytes is 4 bytes aligned
         if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
             // another DRM restriciton... SI has 4 pixels
-            (copyRegion.gpuMemoryOffset % 4 != 0)) {
+            (copyRegion.gpuMemoryOffset % 4 != 0) ||
+            (dev().settings().sdamPageFaultWar_ && 
+             (copyRegion.imageOffset.x % dstResource.elementSize() != 0))) {
             result = false;
         }
         else {
@@ -1204,7 +1206,9 @@ Resource::partialMemCopyTo(
         // Make sure linear pitch in bytes is 4 bytes aligned
         if (((copyRegion.gpuMemoryRowPitch % 4) != 0) ||
             // another DRM restriciton... SI has 4 pixels
-            (copyRegion.gpuMemoryOffset % 4 != 0)) {
+            (copyRegion.gpuMemoryOffset % 4 != 0) ||
+            (dev().settings().sdamPageFaultWar_ &&
+             (copyRegion.imageOffset.x % elementSize() != 0))) {
             result = false;
         }
         else {
diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp
index 2a239d03e3..86c5542fd4 100644
--- a/rocclr/runtime/device/pal/palsettings.cpp
+++ b/rocclr/runtime/device/pal/palsettings.cpp
@@ -115,20 +115,17 @@ Settings::Settings()
     numDeviceEvents_ = 1024;
     numWaitEvents_   = 8;
 
-    // Disable HSAIL by default
-    hsail_ = false;
-
     // Don't support platform atomics by default.
     svmAtomics_ = false;
 
-    // Use direct SRD by default
-    hsailDirectSRD_ = GPU_DIRECT_SRD;
-
     // Use host queue for device enqueuing by default
     useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
 
     // Don't support Denormals for single precision by default
     singleFpDenorm_ = false;
+
+    // Disable SDMA workaround by default
+    sdamPageFaultWar_ = false;
 }
 
 bool
@@ -179,6 +176,9 @@ Settings::create(
         // Keep this false even though we have support
         // singleFpDenorm_ = true;
         viPlus_ = true;
+        // SDMA may have memory access outside of
+        // the valid buffer range and cause a page fault
+        sdamPageFaultWar_ = true;
         // Fall through to CI ...
     case Pal::AsicRevision::Kalindi:
     case Pal::AsicRevision::Spectre:
@@ -193,7 +193,6 @@ Settings::create(
     case Pal::AsicRevision::Bonaire:
     case Pal::AsicRevision::Hawaii:
         ciPlus_ = true;
-        hsail_ = true;
         threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
         reportFMAF_ = false;
         if (palProp.revision == Pal::AsicRevision::Hawaii) {
@@ -228,11 +227,10 @@ Settings::create(
         // This needs to be cleaned once 64bit addressing is stable
         if (oclVersion_ < OpenCL20) {
             use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
-                /*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
+                /*calAttr.isWorkstation ||*/ true) : GPU_FORCE_64BIT_PTR;
         }
         else {
-            if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
-                || (oclVersion_ >= OpenCL20)))) {
+            if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
                 use64BitPtr_    = true;
             }
         }
diff --git a/rocclr/runtime/device/pal/palsettings.hpp b/rocclr/runtime/device/pal/palsettings.hpp
index 4aea512f7c..50a27a9a43 100644
--- a/rocclr/runtime/device/pal/palsettings.hpp
+++ b/rocclr/runtime/device/pal/palsettings.hpp
@@ -50,56 +50,55 @@ public:
             uint    disablePersistent_: 1;  //!< Disables using persistent memory for staging
             uint    imageSupport_: 1;       //!< Report images support
             uint    doublePrecision_: 1;    //!< Enables double precision support
-            uint    reportFMAF_: 1;     //!< Report FP_FAST_FMAF define in CL program
-            uint    reportFMA_: 1;      //!< Report FP_FAST_FMA define in CL program
-            uint    use64BitPtr_: 1;    //!< Use 64bit pointers on GPU
+            uint    reportFMAF_: 1;         //!< Report FP_FAST_FMAF define in CL program
+            uint    reportFMA_: 1;          //!< Report FP_FAST_FMA define in CL program
+            uint    use64BitPtr_: 1;        //!< Use 64bit pointers on GPU
             uint    force32BitOcl20_: 1;    //!< Force 32bit apps to take CLANG/HSAIL path on GPU
-            uint    imageDMA_: 1;       //!< Enable direct image DMA transfers
-            uint    syncObject_: 1;     //!< Enable syncobject
-            uint    ciPlus_: 1;         //!< CI and post CI features
-            uint    viPlus_: 1;         //!< VI and post VI features
-            uint    aiPlus_: 1;         //!< AI and post AI features
+            uint    imageDMA_: 1;           //!< Enable direct image DMA transfers
+            uint    syncObject_: 1;         //!< Enable syncobject
+            uint    ciPlus_: 1;             //!< CI and post CI features
+            uint    viPlus_: 1;             //!< VI and post VI features
+            uint    aiPlus_: 1;             //!< AI and post AI features
             uint    threadTraceEnable_: 1;  //!< Thread trace enable
             uint    linearPersistentImage_: 1;  //!< Allocates linear images in persistent
             uint    useSingleScratch_: 1;   //!< Allocates single scratch per device
-            uint    hsail_: 1;          //!< Enables HSAIL compilation
             uint    stagingWritePersistent_: 1; //!< Enables persistent writes
-            uint    svmAtomics_: 1;     //!< SVM device atomics
-            uint    svmFineGrainSystem_: 1;     //!< SVM fine grain system support
-            uint    apuSystem_: 1;      //!< Device is APU system with shared memory
-            uint    hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
-            uint    useDeviceQueue_: 1; //!< Submit to separate device queue
-            uint    singleFpDenorm_: 1; //!< Support Single FP Denorm
-            uint    reserved_: 5;
+            uint    svmAtomics_: 1;         //!< SVM device atomics
+            uint    svmFineGrainSystem_: 1; //!< SVM fine grain system support
+            uint    apuSystem_: 1;          //!< Device is APU system with shared memory
+            uint    useDeviceQueue_: 1;     //!< Submit to separate device queue
+            uint    singleFpDenorm_: 1;     //!< Support Single FP Denorm
+            uint    sdamPageFaultWar_: 1;   //!< SDMA page fault workaround
+            uint    reserved_: 7;
         };
         uint    value_;
     };
 
     uint    oclVersion_;        //!< Reported OpenCL version support
     uint    debugFlags_;        //!< Debug GPU flags
-    size_t  stagedXferSize_;    //!< Staged buffer size
     uint    maxRenames_;        //!< Maximum number of possible renames
     uint    maxRenameSize_;     //!< Maximum size for all renames
     uint    hwLDSSize_;         //!< HW local data store size
     uint    maxWorkGroupSize_;  //!< Requested workgroup size for this device
-    uint    hostMemDirectAccess_;   //!< Enables direct access to the host memory
-    amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
     uint    workloadSplitSize_; //!< Workload split size
     uint    minWorkloadTime_;   //!< Minimal workload time in 0.1 ms
     uint    maxWorkloadTime_;   //!< Maximum workload time in 0.1 ms
     uint    blitEngine_;        //!< Blit engine type
-    size_t  pinnedXferSize_;    //!< Pinned buffer size for transfer
-    size_t  pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
-    size_t  resourceCacheSize_; //!< Resource cache size in MB
-    uint64_t    maxAllocSize_;  //!< Maximum single allocation size
-    size_t  numMemDependencies_;//!< The array size for memory dependencies tracking
     uint    cacheLineSize_;     //!< Cache line size in bytes
     uint    cacheSize_;         //!< L1 cache size in bytes
-    size_t  xferBufSize_;       //!< Transfer buffer size for image copy optimization
     uint    numComputeRings_;   //!< 0 - disabled, 1 , 2,.. - the number of compute rings
     uint    numDeviceEvents_;   //!< The number of device events
     uint    numWaitEvents_;     //!< The number of wait events for device enqueue
+    uint    hostMemDirectAccess_;   //!< Enables direct access to the host memory
+    size_t  xferBufSize_;       //!< Transfer buffer size for image copy optimization
+    size_t  stagedXferSize_;    //!< Staged buffer size
+    size_t  pinnedXferSize_;    //!< Pinned buffer size for transfer
+    size_t  pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
+    size_t  resourceCacheSize_; //!< Resource cache size in MB
+    size_t  numMemDependencies_;//!< The array size for memory dependencies tracking
+    uint64_t    maxAllocSize_;  //!< Maximum single allocation size
 
+    amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
 
     //! Default constructor
     Settings();
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 674a11ac3f..cd1b77bd6a 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -139,8 +139,6 @@ release(bool, GPU_HSAIL_ENABLE, LP64_SWITCH(LINUX_SWITCH(false,true),true),   \
         "Enable HSAIL on dGPU stack (requires CI+ HW)")                       \
 release(uint, GPU_PRINT_CHILD_KERNEL, 0,                                      \
         "Prints the specified number of the child kernels")                   \
-release(bool, GPU_DIRECT_SRD, false,                                          \
-        "Use indirect SRD access in HSAIL")                                   \
 release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
         "Use a dedicated device queue for the actual submissions")            \
 release(bool, GPU_ENABLE_LARGE_ALLOCATION, true,                              \