From d7fdd9fcb8f96ebefa7961da492ae64e59867648 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Wed, 15 Feb 2023 07:23:23 +0000
Subject: [PATCH] SWDEV-368235 - Revert "Remove obsolete env variables"

This reverts commit dfa77900301856bcc5c987fcd26aa70d5f1ca12f.

Reason for revert: Removal of the obsolete environment variables is deferred to a future release.

Change-Id: Ia66c37f0ab9734dee73c930d10d7469d5fd57254


[ROCm/clr commit: 5dc104b3ea98b6cd9c6b4a227de696a120b7ceee]
---
 projects/clr/rocclr/device/device.cpp         |  11 ++
 projects/clr/rocclr/device/device.hpp         |   3 +-
 projects/clr/rocclr/device/devkernel.cpp      | 150 ++++++++++--------
 projects/clr/rocclr/device/pal/paldevice.cpp  |   4 +-
 .../clr/rocclr/device/pal/palsettings.cpp     |  39 ++++-
 .../clr/rocclr/device/pal/palsettings.hpp     |   4 +-
 projects/clr/rocclr/device/pal/palvirtual.cpp |  25 ++-
 projects/clr/rocclr/device/rocm/rocdevice.cpp |  58 +++----
 projects/clr/rocclr/device/rocm/rocdevice.hpp |   2 +-
 .../clr/rocclr/device/rocm/rocsettings.cpp    |  26 +++
 .../clr/rocclr/device/rocm/rocsettings.hpp    |  13 +-
 projects/clr/rocclr/utils/flags.hpp           |  44 +++++
 12 files changed, 273 insertions(+), 106 deletions(-)

diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp
index 288f84f616..f87452dbd5 100644
--- a/projects/clr/rocclr/device/device.cpp
+++ b/projects/clr/rocclr/device/device.cpp
@@ -798,6 +798,17 @@ Settings::Settings() : value_(0) {
   commandQueues_ = 200;  //!< Field value set to maximum number
                          //!< concurrent Virtual GPUs for default
 
+  overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0;
+  overrideLclSet |=
+      (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y))
+      ? 2
+      : 0;
+  overrideLclSet |=
+      (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
+       !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z))
+      ? 4
+      : 0;
+
   fenceScopeAgent_ = AMD_OPT_FLUSH;
   if (amd::IS_HIP) {
     if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) {
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index ed206311fb..a493795e18 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -619,6 +619,7 @@ class Settings : public amd::HeapObject {
   uint64_t extensions_;  //!< Supported OCL extensions
   union {
     struct {
+      uint overrideLclSet : 3;        //!< Bit mask to override the local size
       uint apuSystem_ : 1;            //!< Device is APU system with shared memory
       uint supportRA_ : 1;            //!< Support RA channel order format
       uint waitCommand_ : 1;          //!< Enables a wait for every submitted command
@@ -638,7 +639,7 @@ class Settings : public amd::HeapObject {
       uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device
       uint fenceScopeAgent_ : 1;      //!< Enable fence scope agent in AQL dispatch packet
       uint rocr_backend_ : 1;         //!< Device uses ROCr backend for submissions
-      uint reserved_ : 14;
+      uint reserved_ : 11;
     };
     uint value_;
   };
diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp
index 42eb185741..e598dca74f 100644
--- a/projects/clr/rocclr/device/devkernel.cpp
+++ b/projects/clr/rocclr/device/devkernel.cpp
@@ -677,78 +677,98 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
   if (workGroupInfo()->compileSize_[0] == 0) {
     // Find the default local workgroup size, if it wasn't specified
     if (lclWorkSize[0] == 0) {
-      // Find threads per group
-      size_t thrPerGrp = workGroupInfo()->size_;
+      if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
+        // Find threads per group
+        size_t thrPerGrp = workGroupInfo()->size_;
 
-      // Check if kernel uses images
-      if (flags_.imageEna_ &&
-        // and thread group is a multiple value of wavefronts
-        ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
-        // and it's 2 or 3-dimensional workload
-        (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
-        // Use 8x8 workgroup size if kernel has image writes
-        if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
-          lclWorkSize[0] = 8;
-          lclWorkSize[1] = 8;
+        // Check if kernel uses images
+        if (flags_.imageEna_ &&
+          // and thread group is a multiple value of wavefronts
+          ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
+          // and it's 2 or 3-dimensional workload
+          (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
+          // Use 8x8 workgroup size if kernel has image writes
+          if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
+            lclWorkSize[0] = 8;
+            lclWorkSize[1] = 8;
+          }
+          else {
+            lclWorkSize[0] = 16;
+            lclWorkSize[1] = 16;
+          }
+          if (workDim == 3) {
+            lclWorkSize[2] = 1;
+          }
         }
         else {
-          lclWorkSize[0] = 16;
-          lclWorkSize[1] = 16;
-        }
-        if (workDim == 3) {
-          lclWorkSize[2] = 1;
+          size_t tmp = thrPerGrp;
+          // Split the local workgroup into the most efficient way
+          for (uint d = 0; d < workDim; ++d) {
+            size_t div = tmp;
+            for (; (gblWorkSize[d] % div) != 0; div--)
+              ;
+            lclWorkSize[d] = div;
+            tmp /= div;
+          }
+
+          // Assuming DWORD access
+          const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
+
+          // Check if we couldn't find optimal workload
+          if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+              // or size is too small for the cache line
+            (lclWorkSize[0] < cacheLineMatch)) {
+            size_t maxSize = 0;
+            size_t maxDim = 0;
+            for (uint d = 0; d < workDim; ++d) {
+              if (maxSize < gblWorkSize[d]) {
+                maxSize = gblWorkSize[d];
+                maxDim = d;
+              }
+            }
+            // Use X dimension as high priority. Runtime will assume that
+            // X dimension is more important for the address calculation
+            if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+              lclWorkSize[0] = cacheLineMatch;
+              thrPerGrp /= cacheLineMatch;
+              lclWorkSize[maxDim] = thrPerGrp;
+              for (uint d = 1; d < workDim; ++d) {
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
+                }
+              }
+            }
+            else {
+              // Check if a local workgroup has the most optimal size
+              if (thrPerGrp > maxSize) {
+                thrPerGrp = maxSize;
+              }
+              lclWorkSize[maxDim] = thrPerGrp;
+              for (uint d = 0; d < workDim; ++d) {
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
+                }
+              }
+            }
+          }
         }
       }
       else {
-        size_t tmp = thrPerGrp;
-        // Split the local workgroup into the most efficient way
-        for (uint d = 0; d < workDim; ++d) {
-          size_t div = tmp;
-          for (; (gblWorkSize[d] % div) != 0; div--)
-            ;
-          lclWorkSize[d] = div;
-          tmp /= div;
+        // Use overrides when app doesn't provide workgroup dimensions
+        if (workDim == 1) {
+          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
         }
-
-        // Assuming DWORD access
-        const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
-
-        // Check if we couldn't find optimal workload
-        if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
-            // or size is too small for the cache line
-          (lclWorkSize[0] < cacheLineMatch)) {
-          size_t maxSize = 0;
-          size_t maxDim = 0;
-          for (uint d = 0; d < workDim; ++d) {
-            if (maxSize < gblWorkSize[d]) {
-              maxSize = gblWorkSize[d];
-              maxDim = d;
-            }
-          }
-          // Use X dimension as high priority. Runtime will assume that
-          // X dimension is more important for the address calculation
-          if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
-            lclWorkSize[0] = cacheLineMatch;
-            thrPerGrp /= cacheLineMatch;
-            lclWorkSize[maxDim] = thrPerGrp;
-            for (uint d = 1; d < workDim; ++d) {
-              if (d != maxDim) {
-                lclWorkSize[d] = 1;
-              }
-            }
-          }
-          else {
-            // Check if a local workgroup has the most optimal size
-            if (thrPerGrp > maxSize) {
-              thrPerGrp = maxSize;
-            }
-            lclWorkSize[maxDim] = thrPerGrp;
-            for (uint d = 0; d < workDim; ++d) {
-              if (d != maxDim) {
-                lclWorkSize[d] = 1;
-              }
-            }
-          }
+        else if (workDim == 2) {
+          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
+          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+        }
+        else if (workDim == 3) {
+          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
+          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+          lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+        }
+        else {
+          assert(0 && "Invalid workDim!");
         }
       }
     }
diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp
index 5755f34a81..8b07d3914c 100644
--- a/projects/clr/rocclr/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/device/pal/paldevice.cpp
@@ -291,7 +291,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve
                                nullptr,
                                nullptr,
                                nullptr,
-                               nullptr};
+                               AMD_OCL_SC_LIB};
     // Initialize the compiler handle
     acl_error error;
     compiler_ = amd::Hsail::CompilerInit(&opts, &error);
@@ -1013,7 +1013,7 @@ bool Device::create(Pal::IDevice* device) {
                                nullptr,
                                nullptr,
                                nullptr,
-                               nullptr};
+                               AMD_OCL_SC_LIB};
     // Initialize the compiler handle
     acl_error error;
     compiler_ = amd::Hsail::CompilerInit(&opts, &error);
diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp
index 59ac82b8ea..1f4dc8e908 100644
--- a/projects/clr/rocclr/device/pal/palsettings.cpp
+++ b/projects/clr/rocclr/device/pal/palsettings.cpp
@@ -76,6 +76,8 @@ Settings::Settings() {
   // Enable workload split by default (for 24 bit arithmetic or timeout)
   workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;
 
+  // By default use host blit
+  blitEngine_ = BlitEngineHost;
   pinnedXferSize_ = GPU_PINNED_MIN_XFER_SIZE * Mi;
   pinnedMinXferSize_ = flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)
     ? 128 * Mi : GPU_PINNED_MIN_XFER_SIZE * Mi;
@@ -121,6 +123,8 @@ Settings::Settings() {
   //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
   linearPersistentImage_ = false;
 
+  useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
+
   // Device enqueuing settings
   numDeviceEvents_ = 1024;
   numWaitEvents_ = 8;
@@ -324,11 +328,16 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
 
       libSelector_ = amd::GPU_Library_CI;
       if (LP64_SWITCH(false, true)) {
-        oclVersion_ = !reportAsOCL12Device
+        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
+            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
+            : OpenCL12;
+      }
+      if (GPU_FORCE_OCL20_32BIT) {
+        force32BitOcl20_ = true;
+        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
             ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
             : OpenCL12;
       }
-
       if (OPENCL_VERSION < 200) {
         oclVersion_ = OpenCL12;
       }
@@ -337,13 +346,27 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
       // Cap at OpenCL20 for now
       if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
 
-      use64BitPtr_ = LP64_SWITCH(false, true);
+      // This needs to be cleaned once 64bit addressing is stable
+      if (oclVersion_ < OpenCL20) {
+        use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
+            ? LP64_SWITCH(false,
+                          /*calAttr.isWorkstation ||*/ true)
+            : GPU_FORCE_64BIT_PTR;
+      } else {
+        if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
+          use64BitPtr_ = true;
+        }
+      }
 
       if (oclVersion_ >= OpenCL20) {
         supportDepthsRGB_ = true;
       }
       if (use64BitPtr_) {
-        maxAllocSize_ = 64ULL * Gi;
+        if (GPU_ENABLE_LARGE_ALLOCATION) {
+          maxAllocSize_ = 64ULL * Gi;
+        } else {
+          maxAllocSize_ = 4048 * Mi;
+        }
       } else {
         maxAllocSize_ = 3ULL * Gi;
       }
@@ -424,6 +447,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
 
   imageSupport_ = true;
 
+  // Use kernels for blit if appropriate
+  blitEngine_ = BlitEngineKernel;
+
   hostMemDirectAccess_ |= HostMemBuffer;
   // HW doesn't support untiled image writes
   // hostMemDirectAccess_ |= HostMemImage;
@@ -516,6 +542,11 @@ void Settings::override() {
     preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
   }
 
+  // Override blit engine type
+  if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
+    blitEngine_ = GPU_BLIT_ENGINE_TYPE;
+  }
+
   if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
     debugFlags_ = DEBUG_GPU_FLAGS;
   }
diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp
index 4bcc7e2e9b..66984622ee 100644
--- a/projects/clr/rocclr/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/device/pal/palsettings.hpp
@@ -70,6 +70,7 @@ class Settings : public device::Settings {
       uint gfx10Plus_ : 1;              //!< gfx10 and post gfx10 features
       uint threadTraceEnable_ : 1;      //!< Thread trace enable
       uint linearPersistentImage_ : 1;  //!< Allocates linear images in persistent
+      uint useSingleScratch_ : 1;       //!< Allocates single scratch per device
       uint svmAtomics_ : 1;             //!< SVM device atomics
       uint svmFineGrainSystem_ : 1;     //!< SVM fine grain system support
       uint useDeviceQueue_ : 1;         //!< Submit to separate device queue
@@ -81,7 +82,7 @@ class Settings : public device::Settings {
       uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
       uint disableSdma_ : 1;            //!< Disable SDMA support
       uint alwaysResident_ : 1;         //!< Make resources resident at allocation time
-      uint reserved_ : 8;
+      uint reserved_ : 7;
     };
     uint value_;
   };
@@ -94,6 +95,7 @@ class Settings : public device::Settings {
   uint workloadSplitSize_;       //!< Workload split size
   uint minWorkloadTime_;         //!< Minimal workload time in 0.1 ms
   uint maxWorkloadTime_;         //!< Maximum workload time in 0.1 ms
+  uint blitEngine_;              //!< Blit engine type
   uint cacheLineSize_;           //!< Cache line size in bytes
   uint cacheSize_;               //!< L1 cache size in bytes
   uint numComputeRings_;         //!< 0 - disabled, 1 , 2,.. - the number of compute rings
diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp
index 3a2081712a..624379d104 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.cpp
@@ -905,6 +905,11 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
   // because destructor calls eraseResourceList() even if create() failed
   dev().resizeResoureList(index());
 
+  if (index() >= GPU_MAX_COMMAND_QUEUES) {
+    // Cap the maximum number of concurrent Virtual GPUs
+    return false;
+  }
+
   // Virtual GPU will have profiling enabled
   state_.profiling_ = profiling;
 
@@ -1015,7 +1020,18 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
     return false;
   }
 
-  blitMgr_ = new KernelBlitManager(*this, blitSetup);
+  // Choose the appropriate class for blit engine
+  switch (dev().settings().blitEngine_) {
+    default:
+    // Fall through ...
+    case Settings::BlitEngineHost:
+      blitSetup.disableAll();
+    // Fall through ...
+    case Settings::BlitEngineCAL:
+    case Settings::BlitEngineKernel:
+      blitMgr_ = new KernelBlitManager(*this, blitSetup);
+      break;
+  }
   if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
     LogError("Could not create BlitManager!");
     return false;
@@ -3253,8 +3269,11 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
       cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
 
       uint64_t endTimeStampCPU = amd::Os::timeNanos();
-      // Adjust the base time by the execution time
-      readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
+      // Keep the base time untouched when raw GPU timestamps are requested
+      if (!GPU_RAW_TIMESTAMP) {
+        // Adjust the base time by the execution time
+        readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
+      }
     }
   }
 }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 95a4d375f0..2151467103 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -894,35 +894,37 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
   Device* dev = reinterpret_cast<Device*>(data);
   switch (segment_type) {
     case HSA_REGION_SEGMENT_GLOBAL: {
-      uint32_t global_flag = 0;
-      hsa_status_t stat =
-          hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
-      if (stat != HSA_STATUS_SUCCESS) {
-        return stat;
-      }
-
-      if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
-        dev->gpu_fine_grained_segment_ = pool;
-      } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
-        dev->gpuvm_segment_ = pool;
-
-        // If cpu agent cannot access this pool, the device does not support large bar.
-        hsa_amd_memory_pool_access_t tmp{};
-        hsa_amd_agent_memory_pool_get_info(
-          dev->cpu_agent_,
-          pool,
-          HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
-          &tmp);
-
-        if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
-          dev->info_.largeBar_ = false;
-        } else {
-          dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR;
+      if (dev->settings().enableLocalMemory_) {
+        uint32_t global_flag = 0;
+        hsa_status_t stat =
+            hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
+        if (stat != HSA_STATUS_SUCCESS) {
+          return stat;
         }
-      }
 
-      if (dev->gpuvm_segment_.handle == 0) {
-        dev->gpuvm_segment_ = pool;
+        if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
+          dev->gpu_fine_grained_segment_ = pool;
+        } else if ((global_flag & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) != 0) {
+          dev->gpuvm_segment_ = pool;
+
+          // If cpu agent cannot access this pool, the device does not support large bar.
+          hsa_amd_memory_pool_access_t tmp{};
+          hsa_amd_agent_memory_pool_get_info(
+            dev->cpu_agent_,
+            pool,
+            HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+            &tmp);
+
+          if (tmp == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+            dev->info_.largeBar_ = false;
+          } else {
+            dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR;
+          }
+        }
+
+        if (dev->gpuvm_segment_.handle == 0) {
+          dev->gpuvm_segment_ = pool;
+        }
       }
       break;
     }
@@ -1232,7 +1234,7 @@ bool Device::populateOCLDeviceConstants() {
 
   info_.maxWorkItemDimensions_ = 3;
 
-  if (gpuvm_segment_.handle != 0) {
+  if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) {
     size_t global_segment_size = 0;
     if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_,
                                                            HSA_AMD_MEMORY_POOL_INFO_SIZE,
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 275fe45761..3fcbf0391e 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -218,7 +218,7 @@ class NullDevice : public amd::Device {
 
   //! Determine if we can use device memory for SVM
   const bool forceFineGrain(amd::Memory* memory) const {
-    return (memory->getContext().devices().size() > 1);
+    return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1);
   }
 
   virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle) {
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index 24cb88dcbd..71c341ad43 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -35,9 +35,18 @@ Settings::Settings() {
   // Set this to true when we drop the flag
   doublePrecision_ = ::CL_KHR_FP64;
 
+  enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
+  enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM;
+
   maxWorkGroupSize_ = 1024;
   preferredWorkGroupSize_ = 256;
 
+  maxWorkGroupSize2DX_ = 16;
+  maxWorkGroupSize2DY_ = 16;
+  maxWorkGroupSize3DX_ = 4;
+  maxWorkGroupSize3DY_ = 4;
+  maxWorkGroupSize3DZ_ = 4;
+
   kernargPoolSize_ = HSA_KERNARG_POOL_SIZE;
 
   // Determine if user is requesting Non-Coherent mode
@@ -192,6 +201,23 @@ void Settings::override() {
     preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
   }
 
+  if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
+    maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
+    maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
+  }
+
+  if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
+    maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
+    maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
+  }
+  if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
+    maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
+  }
+
   if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
     xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
   }
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index 5b5f81d7d7..d2fffd73db 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -42,6 +42,8 @@ class Settings : public device::Settings {
   union {
     struct {
       uint doublePrecision_ : 1;        //!< Enables double precision support
+      uint enableLocalMemory_ : 1;      //!< Enable GPUVM memory
+      uint enableCoarseGrainSVM_ : 1;   //!< Enable device memory for coarse grain SVM allocations
       uint enableNCMode_ : 1;           //!< Enable Non Coherent mode for system memory
       uint imageDMA_ : 1;               //!< Enable direct image DMA transfers
       uint stagedXferRead_ : 1;         //!< Uses a staged buffer read
@@ -53,7 +55,7 @@ class Settings : public device::Settings {
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint coop_sync_ : 1;              //!< grid and multi-grid sync for gfx940+
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint reserved_ : 20;
+      uint reserved_ : 18;
     };
     uint value_;
   };
@@ -64,6 +66,15 @@ class Settings : public device::Settings {
   //! Preferred workgroup size
   uint preferredWorkGroupSize_;
 
+  //! Default max workgroup sizes for 2D
+  int maxWorkGroupSize2DX_;
+  int maxWorkGroupSize2DY_;
+
+  //! Default max workgroup sizes for 3D
+  int maxWorkGroupSize3DX_;
+  int maxWorkGroupSize3DY_;
+  int maxWorkGroupSize3DZ_;
+
   uint kernargPoolSize_;
   uint numDeviceEvents_;      //!< The number of device events
   uint numWaitEvents_;        //!< The number of wait events for device enqueue
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index f563c6d3bc..186d059b41 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -30,10 +30,22 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF,                                       \
         "The mask to enable specific kinds of logs")                          \
 debug(uint, DEBUG_GPU_FLAGS, 0,                                               \
         "The debug options for GPU device")                                   \
+release(uint, GPU_MAX_COMMAND_QUEUES, 300,                                    \
+        "The maximum number of concurrent Virtual GPUs")                      \
 release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */         \
         "The default command queue thread stack size")                        \
 release(int, GPU_MAX_WORKGROUP_SIZE, 0,                                       \
         "Maximum number of workitems in a workgroup for GPU, 0 -use default") \
+release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0,                                  \
+        "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
+release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0,                                  \
+        "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
+release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0,                                  \
+        "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
+release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0,                                  \
+        "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
+release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0,                                  \
+        "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
 debug(bool, CPU_MEMORY_GUARD_PAGES, false,                                    \
         "Use guard pages for CPU memory")                                     \
 debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64,                                 \
@@ -58,8 +70,12 @@ release(uint, GPU_STAGING_BUFFER_SIZE, 4,                                     \
         "Size of the GPU staging buffer in MiB")                              \
 release(bool, GPU_DUMP_BLIT_KERNELS, false,                                   \
         "Dump the kernels for blit manager")                                  \
+release(uint, GPU_BLIT_ENGINE_TYPE, 0x0,                                      \
+        "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel")       \
 release(bool, GPU_FLUSH_ON_EXECUTION, false,                                  \
         "Submit commands to HW on every operation. 0 - Disable, 1 - Enable")  \
+release(bool, GPU_USE_SYNC_OBJECTS, true,                                     \
+        "If enabled, use sync objects instead of polling")                    \
 release(bool, CL_KHR_FP64, true,                                              \
         "Enable/Disable support for double precision")                        \
 release(cstring, AMD_OCL_BUILD_OPTIONS, 0,                                    \
@@ -70,8 +86,12 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0,                                     \
         "Set clLinkProgram()'s options (override)")                           \
 release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0,                              \
         "Append clLinkProgram()'s options")                                   \
+release(cstring, AMD_OCL_SC_LIB, 0,                                           \
+        "Set shader compiler shared library name or path")                    \
 debug(cstring, AMD_OCL_SUBST_OBJFILE, 0,                                      \
         "Specify binary substitution config file for OpenCL")                 \
+debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false,                                \
+        "Enable the error dialog on Windows")                                 \
 release(size_t, GPU_PINNED_XFER_SIZE, 32,                                     \
         "The pinned buffer size for pinning in read/write transfers in MiB")  \
 release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128,                                \
@@ -80,6 +100,12 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
         "The resource cache size in MB")                                      \
 release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
         "The maximum size accepted for suballocaitons in KB")                 \
+release(bool, GPU_FORCE_64BIT_PTR, 0,                                         \
+        "Forces 64 bit pointers on GPU")                                      \
+release(bool, GPU_FORCE_OCL20_32BIT, 0,                                       \
+        "Forces 32 bit apps to take CLANG\\HSAIL path")                       \
+release(bool, GPU_RAW_TIMESTAMP, 0,                                           \
+        "Reports GPU raw timestamps in GPU timeline")                         \
 release(size_t, GPU_NUM_MEM_DEPENDENCY, 256,                                  \
         "Number of memory objects for dependency tracking")                   \
 release(size_t, GPU_XFER_BUFFER_SIZE, 0,                                      \
@@ -90,20 +116,32 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85,                                   \
         "Maximum size of a single allocation as percentage of total")         \
 release(uint, GPU_NUM_COMPUTE_RINGS, 2,                                       \
         "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
+release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1,                                 \
+        "GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \
 release(uint, GPU_WORKLOAD_SPLIT, 22,                                         \
         "Workload split size")                                                \
+release(bool, GPU_USE_SINGLE_SCRATCH, false,                                  \
+        "Use single scratch buffer per device instead of per HW ring")        \
 release(bool, AMD_OCL_WAIT_COMMAND, false,                                    \
         "1 = Enable a wait for every submitted command")                      \
 release(uint, GPU_PRINT_CHILD_KERNEL, 0,                                      \
         "Prints the specified number of the child kernels")                   \
 release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
         "Use a dedicated device queue for the actual submissions")            \
+release(bool, GPU_ENABLE_LARGE_ALLOCATION, true,                              \
+        "Enable >4GB single allocations")                                     \
 release(bool, AMD_THREAD_TRACE_ENABLE, true,                                  \
         "Enable thread trace extension")                                      \
 release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200),                        \
         "Force GPU opencl verison")                                           \
+release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
+        "Enable HSA device local memory usage")                               \
 release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024,                             \
         "Kernarg pool size")                                                  \
+release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true,                              \
+        "Enable device memory for coarse grain SVM allocations")              \
+release(bool, GPU_IFH_MODE, false,                                            \
+        "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
 release(bool, GPU_MIPMAP, true,                                               \
         "Enables GPU mipmap extension")                                       \
 release(uint, GPU_ENABLE_PAL, 2,                                              \
@@ -114,6 +152,8 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1,                              \
         "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
 release(uint, OCL_SET_SVM_SIZE, 4*16384,                                      \
         "set SVM space size for discrete GPU")                                \
+debug(uint, OCL_SYSMEM_REQUIREMENT, 2,                                        \
+        "Use flag to change the minimum requirement of system memory not to downgrade")        \
 release(uint, GPU_WAVES_PER_SIMD, 0,                                          \
         "Force the number of waves per SIMD (1-10)")                          \
 release(bool, GPU_WAVE_LIMIT_ENABLE, false,                                   \
@@ -136,6 +176,10 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "",                              \
         "File path prefix for dumping wave limiter output")                   \
 release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "",                             \
         "File path prefix for tracing wave limiter")                          \
+release(bool, OCL_CODE_CACHE_ENABLE, false,                                   \
+        "1 = Enable compiler code cache")                                     \
+release(bool, OCL_CODE_CACHE_RESET, false,                                    \
+        "1 =  Reset the compiler code cache storage")                         \
 release(bool, PAL_DISABLE_SDMA, false,                                        \
         "1 = Disable SDMA for PAL")                                           \
 release(uint, PAL_RGP_DISP_COUNT, 10000,                                      \