From 3f4bbcfdba1aa947ca3cd2d22c85cb36613e2274 Mon Sep 17 00:00:00 2001
From: German <German.Andryeyev@amd.com>
Date: Wed, 23 Aug 2023 13:35:56 -0400
Subject: [PATCH] SWDEV-407533 - [ABI Break] Purge unused env vars

Change-Id: I627950e8ebb6299affc602754a20d442dbe42b14


[ROCm/clr commit: 077311153a251ae0fc6b99a66944a66cc3abe8da]
---
 projects/clr/rocclr/device/device.cpp         |  14 --
 projects/clr/rocclr/device/device.hpp         |   8 +-
 projects/clr/rocclr/device/devkernel.cpp      | 148 ++++++++----------
 projects/clr/rocclr/device/pal/paldevice.cpp  |   4 +-
 .../clr/rocclr/device/pal/palsettings.cpp     | 121 ++------------
 .../clr/rocclr/device/pal/palsettings.hpp     |   7 +-
 projects/clr/rocclr/device/pal/palvirtual.cpp |  85 +---------
 projects/clr/rocclr/device/pal/palvirtual.hpp |  35 -----
 projects/clr/rocclr/device/rocm/rocdevice.hpp |   2 +-
 .../clr/rocclr/device/rocm/rocsettings.cpp    |  25 ---
 .../clr/rocclr/device/rocm/rocsettings.hpp    |  13 +-
 .../clr/rocclr/device/rocm/rocvirtual.cpp     |   2 +-
 projects/clr/rocclr/utils/flags.hpp           |  48 +-----
 projects/clr/rocclr/utils/macros.hpp          |   6 -
 14 files changed, 88 insertions(+), 430 deletions(-)

diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp
index bf88996f47..0249f31d6d 100644
--- a/projects/clr/rocclr/device/device.cpp
+++ b/projects/clr/rocclr/device/device.cpp
@@ -933,20 +933,6 @@ Settings::Settings() : value_(0) {
   customHostAllocator_ = false;
   waitCommand_ = AMD_OCL_WAIT_COMMAND;
   supportDepthsRGB_ = false;
-  commandQueues_ = 200;  //!< Field value set to maximum number
-                         //!< concurrent Virtual GPUs for default
-
-  overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0;
-  overrideLclSet |=
-      (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y))
-      ? 2
-      : 0;
-  overrideLclSet |=
-      (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
-       !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z))
-      ? 4
-      : 0;
-
   fenceScopeAgent_ = AMD_OPT_FLUSH;
   if (amd::IS_HIP) {
     if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) {
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 5074c6fb72..86e3caff1a 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -640,7 +640,6 @@ class Settings : public amd::HeapObject {
   uint64_t extensions_;  //!< Supported OCL extensions
   union {
     struct {
-      uint overrideLclSet : 3;        //!< Bit mask to override the local size
       uint apuSystem_ : 1;            //!< Device is APU system with shared memory
       uint supportRA_ : 1;            //!< Support RA channel order format
       uint waitCommand_ : 1;          //!< Enables a wait for every submitted command
@@ -660,14 +659,11 @@ class Settings : public amd::HeapObject {
       uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device
       uint fenceScopeAgent_ : 1;      //!< Enable fence scope agent in AQL dispatch packet
       uint rocr_backend_ : 1;         //!< Device uses ROCr backend for submissions
-      uint reserved_ : 11;
+      uint reserved_ : 14;
     };
     uint value_;
   };
 
-  uint commandQueues_;  //!< Field value for maximum number
-                        //!< concurrent Virtual GPUs for each backend
-
   //! Default constructor
   Settings();
 
@@ -1383,7 +1379,7 @@ class Isa {
 
   /// @returns If the ROCm runtime supports the ISA.
   bool runtimeRocSupported() const {
-    if (!IS_HIP && !ROC_ENABLE_PRE_VEGA && (versionMajor_ == 8)) {
+    if (!IS_HIP && (versionMajor_ == 8)) {
       return false;
     }
     return runtimeRocSupported_;
diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp
index 4bddff1982..db2bdfb81c 100644
--- a/projects/clr/rocclr/device/devkernel.cpp
+++ b/projects/clr/rocclr/device/devkernel.cpp
@@ -684,102 +684,82 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
   if (workGroupInfo()->compileSize_[0] == 0) {
     // Find the default local workgroup size, if it wasn't specified
     if (lclWorkSize[0] == 0) {
-      if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
-        // Find threads per group
-        size_t thrPerGrp = workGroupInfo()->size_;
+      // Find threads per group
+      size_t thrPerGrp = workGroupInfo()->size_;
 
-        // Check if kernel uses images
-        if (flags_.imageEna_ &&
-          // and thread group is a multiple value of wavefronts
-          ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
-          // and it's 2 or 3-dimensional workload
-          (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
-          // Use 8x8 workgroup size if kernel has image writes
-          if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
-            lclWorkSize[0] = 8;
-            lclWorkSize[1] = 8;
-          }
-          else {
-            lclWorkSize[0] = 16;
-            lclWorkSize[1] = 16;
-          }
-          if (workDim == 3) {
-            lclWorkSize[2] = 1;
-          }
+      // Check if kernel uses images
+      if (flags_.imageEna_ &&
+        // and thread group is a multiple value of wavefronts
+        ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
+        // and it's 2 or 3-dimensional workload
+        (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
+        // Use 8x8 workgroup size if kernel has image writes
+        if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
+          lclWorkSize[0] = 8;
+          lclWorkSize[1] = 8;
         }
         else {
-          size_t tmp = thrPerGrp;
-          // Split the local workgroup into the most efficient way
-          for (uint d = 0; d < workDim; ++d) {
-            size_t div = tmp;
-            for (; (gblWorkSize[d] % div) != 0; div--)
-              ;
-            lclWorkSize[d] = div;
-            tmp /= div;
-          }
+          lclWorkSize[0] = 16;
+          lclWorkSize[1] = 16;
+        }
+        if (workDim == 3) {
+          lclWorkSize[2] = 1;
+        }
+      }
+      else {
+        size_t tmp = thrPerGrp;
+        // Split the local workgroup into the most efficient way
+        for (uint d = 0; d < workDim; ++d) {
+          size_t div = tmp;
+          for (; (gblWorkSize[d] % div) != 0; div--)
+            ;
+          lclWorkSize[d] = div;
+          tmp /= div;
+        }
 
-          if (!workGroupInfo()->uniformWorkGroupSize_) {
-            // Assuming DWORD access
-            const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
+        if (!workGroupInfo()->uniformWorkGroupSize_) {
+          // Assuming DWORD access
+          const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
 
-            // Check if we couldn't find optimal workload
-            if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
-                // or size is too small for the cache line
-              (lclWorkSize[0] < cacheLineMatch)) {
-              size_t maxSize = 0;
-              size_t maxDim = 0;
+          // Check if we couldn't find optimal workload
+          if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+              // or size is too small for the cache line
+            (lclWorkSize[0] < cacheLineMatch)) {
+            size_t maxSize = 0;
+            size_t maxDim = 0;
+            for (uint d = 0; d < workDim; ++d) {
+              if (maxSize < gblWorkSize[d]) {
+                maxSize = gblWorkSize[d];
+                maxDim = d;
+              }
+            }
+            // Use X dimension as high priority. Runtime will assume that
+            // X dimension is more important for the address calculation
+            if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+              lclWorkSize[0] = cacheLineMatch;
+              thrPerGrp /= cacheLineMatch;
+              lclWorkSize[maxDim] = thrPerGrp;
+              for (uint d = 1; d < workDim; ++d) {
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
+                }
+              }
+            }
+            else {
+              // Check if a local workgroup has the most optimal size
+              if (thrPerGrp > maxSize) {
+                thrPerGrp = maxSize;
+              }
+              lclWorkSize[maxDim] = thrPerGrp;
               for (uint d = 0; d < workDim; ++d) {
-                if (maxSize < gblWorkSize[d]) {
-                  maxSize = gblWorkSize[d];
-                  maxDim = d;
-                }
-              }
-              // Use X dimension as high priority. Runtime will assume that
-              // X dimension is more important for the address calculation
-              if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
-                lclWorkSize[0] = cacheLineMatch;
-                thrPerGrp /= cacheLineMatch;
-                lclWorkSize[maxDim] = thrPerGrp;
-                for (uint d = 1; d < workDim; ++d) {
-                  if (d != maxDim) {
-                    lclWorkSize[d] = 1;
-                  }
-                }
-              }
-              else {
-                // Check if a local workgroup has the most optimal size
-                if (thrPerGrp > maxSize) {
-                  thrPerGrp = maxSize;
-                }
-                lclWorkSize[maxDim] = thrPerGrp;
-                for (uint d = 0; d < workDim; ++d) {
-                  if (d != maxDim) {
-                    lclWorkSize[d] = 1;
-                  }
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
                 }
               }
             }
           }
         }
       }
-      else {
-        // Use overrides when app doesn't provide workgroup dimensions
-        if (workDim == 1) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
-        }
-        else if (workDim == 2) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
-        }
-        else if (workDim == 3) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
-          lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
-        }
-        else {
-          assert(0 && "Invalid workDim!");
-        }
-      }
     }
   }
   else {
diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp
index 55585f3d34..d79daf71b8 100644
--- a/projects/clr/rocclr/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/device/pal/paldevice.cpp
@@ -300,7 +300,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve
                                nullptr,
                                nullptr,
                                nullptr,
-                               AMD_OCL_SC_LIB};
+                               nullptr};
     // Initialize the compiler handle
     acl_error error;
     compiler_ = amd::Hsail::CompilerInit(&opts, &error);
@@ -1029,7 +1029,7 @@ bool Device::create(Pal::IDevice* device) {
                                nullptr,
                                nullptr,
                                nullptr,
-                               AMD_OCL_SC_LIB};
+                               nullptr};
     // Initialize the compiler handle
     acl_error error;
     compiler_ = amd::Hsail::CompilerInit(&opts, &error);
diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp
index 41d03af263..877498059a 100644
--- a/projects/clr/rocclr/device/pal/palsettings.cpp
+++ b/projects/clr/rocclr/device/pal/palsettings.cpp
@@ -33,19 +33,6 @@
 
 namespace pal {
 
-/*! \brief information for adjusting maximum workload time
- *
- *  This structure contains the time and OS minor version for max workload time
- *  adjustment for Windows 7 or 8.
- */
-struct ModifyMaxWorkload {
-  uint32_t time;          //!< max work load time (10x ms)
-  uint32_t minorVersion;  //!< OS minor version
-#if defined(_WIN32)
-  BYTE comparisonOps;  //!< Comparison option
-#endif
-};
-
 Settings::Settings() {
   // Initialize the GPU device default settings
   oclVersion_ = OpenCL12;
@@ -73,9 +60,6 @@ Settings::Settings() {
 
   libSelector_ = amd::LibraryUndefined;
 
-  // Enable workload split by default (for 24 bit arithmetic or timeout)
-  workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;
-
   // By default use host blit
   blitEngine_ = BlitEngineHost;
   pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi;
@@ -117,15 +101,10 @@ Settings::Settings() {
   // Number of compute rings.
   numComputeRings_ = 0;
 
-  minWorkloadTime_ = 1;       // 0.1 ms
-  maxWorkloadTime_ = 500000;  // 500 ms
-
   // Controls tiled images in persistent
   //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
   linearPersistentImage_ = false;
 
-  useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
-
   // Device enqueuing settings
   numDeviceEvents_ = 1024;
   numWaitEvents_ = 8;
@@ -177,14 +156,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
 
   // Disable thread trace by default for all devices
   threadTraceEnable_ = false;
-  bool doublePrecision = true;
-
-  // Update GPU specific settings and info structure if we have any
-#if defined(_WIN32)
-  ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
-#else
-  ModifyMaxWorkload modifyMaxWorkload = {0};
-#endif
 
   // APU systems
   if (palProp.gpuType == Pal::GpuType::Integrated) {
@@ -250,14 +221,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
         // GFX10.1 HW doesn't support custom pitch. Enable double copy workaround
         imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
       }
-      if (false) {
-        // UnknownDevice0 HW doesn't have SDMA engine
-        disableSdma_ = true;
-        // And LDS is limited to 32KB
-        hwLDSSize_ = 32 * Ki;
-        // No fp64 support
-        doublePrecision = false;
-      }
       // Fall through to AI (gfx9) ...
     case Pal::AsicRevision::Vega20:
       // Enable HW P2P path for Vega20+. Runtime still relies on KMD/PAL for support
@@ -277,15 +240,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
     case Pal::AsicRevision::Carrizo:
     case Pal::AsicRevision::Bristol:
     case Pal::AsicRevision::Stoney:
-      if (!aiPlus_) {
-        // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10
-        minWorkloadTime_ = 1000;
-        modifyMaxWorkload.time = 1000;       // Decided by experiment
-        modifyMaxWorkload.minorVersion = 1;  // Win 7
-#if defined(_WIN32)
-        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // Limit to Win 7 only
-#endif
-      }
     case Pal::AsicRevision::Iceland:
     case Pal::AsicRevision::Tonga:
     case Pal::AsicRevision::Fiji:
@@ -307,15 +261,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
     case Pal::AsicRevision::Godavari:
     case Pal::AsicRevision::Spectre:
     case Pal::AsicRevision::Spooky:
-      if (!viPlus_) {
-        // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
-        modifyMaxWorkload.time = 250000;     // 250ms
-        modifyMaxWorkload.minorVersion = 1;  // Win 7
-#if defined(_WIN32)
-        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // limit to Win 7
-#endif
-      }
-    // Fall through ...
     case Pal::AsicRevision::Bonaire:
     case Pal::AsicRevision::Hawaii:
     case Pal::AsicRevision::HawaiiPro:
@@ -331,13 +276,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
 
       libSelector_ = amd::GPU_Library_CI;
       if (LP64_SWITCH(false, true)) {
-        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
-            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
-            : OpenCL12;
-      }
-      if (GPU_FORCE_OCL20_32BIT) {
-        force32BitOcl20_ = true;
-        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
+        oclVersion_ = !reportAsOCL12Device
             ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
             : OpenCL12;
       }
@@ -348,28 +287,14 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
 
       // Cap at OpenCL20 for now
       if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
-
-      // This needs to be cleaned once 64bit addressing is stable
-      if (oclVersion_ < OpenCL20) {
-        use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
-            ? LP64_SWITCH(false,
-                          /*calAttr.isWorkstation ||*/ true)
-            : GPU_FORCE_64BIT_PTR;
-      } else {
-        if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
-          use64BitPtr_ = true;
-        }
-      }
+      
+      use64BitPtr_ = LP64_SWITCH(false, true);
 
       if (oclVersion_ >= OpenCL20) {
         supportDepthsRGB_ = true;
       }
       if (use64BitPtr_) {
-        if (GPU_ENABLE_LARGE_ALLOCATION) {
-          maxAllocSize_ = 64ULL * Gi;
-        } else {
-          maxAllocSize_ = 4048 * Mi;
-        }
+        maxAllocSize_ = 64ULL * Gi;
       } else {
         maxAllocSize_ = 3ULL * Gi;
       }
@@ -395,26 +320,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
   // Image DMA must be disabled if SDMA is disabled
   imageDMA_ &= !disableSdma_;
 
-  splitSizeForWin7_ = false;
-
-#if defined(_WIN32)
-  OSVERSIONINFOEX versionInfo = {0};
-  versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-  versionInfo.dwMajorVersion = 6;
-  versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
-
-  DWORDLONG conditionMask = 0;
-  VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
-  VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
-
-  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
-    splitSizeForWin7_ = true;  // Update flag of DMA flush split size for Win 7
-    if (modifyMaxWorkload.time > 0) {
-      maxWorkloadTime_ = modifyMaxWorkload.time;  // Update max workload time
-    }
-  }
-#endif  // defined(_WIN32)
-
   // Enable atomics support
   enableExtension(ClKhrInt64BaseAtomics);
   enableExtension(ClKhrInt64ExtendedAtomics);
@@ -457,23 +362,19 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
   // HW doesn't support untiled image writes
   // hostMemDirectAccess_ |= HostMemImage;
 
-  if (doublePrecision) {
-    // Report FP_FAST_FMA define if double precision HW
-    reportFMA_ = true;
-    // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
-    // Bonaire, Kalindi, Spectre and Spooky so disable
-    // FP_FMA_FMAF for those parts in switch below
-    reportFMAF_ = true;
-  }
+  // Report FP_FAST_FMA define if double precision HW
+  reportFMA_ = true;
+  // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
+  // Bonaire, Kalindi, Spectre and Spooky so disable
+  // FP_FMA_FMAF for those parts in switch below
+  reportFMAF_ = true;
 
-  // Make sure device actually supports double precision
-  doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
   if (doublePrecision_) {
     // Enable KHR double precision extension
     enableExtension(ClKhrFp64);
   }
 
-  if (!useLightning_ && doublePrecision) {
+  if (!useLightning_) {
     // Enable AMD double precision extension
     doublePrecision_ = true;
     enableExtension(ClAmdFp64);
diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp
index 66984622ee..32d3ad2a14 100644
--- a/projects/clr/rocclr/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/device/pal/palsettings.hpp
@@ -70,19 +70,17 @@ class Settings : public device::Settings {
       uint gfx10Plus_ : 1;              //!< gfx10 and post gfx10 features
       uint threadTraceEnable_ : 1;      //!< Thread trace enable
       uint linearPersistentImage_ : 1;  //!< Allocates linear images in persistent
-      uint useSingleScratch_ : 1;       //!< Allocates single scratch per device
       uint svmAtomics_ : 1;             //!< SVM device atomics
       uint svmFineGrainSystem_ : 1;     //!< SVM fine grain system support
       uint useDeviceQueue_ : 1;         //!< Submit to separate device queue
       uint sdamPageFaultWar_ : 1;       //!< SDMA page fault workaround
       uint rgpSqttWaitIdle_ : 1;        //!< Wait for idle after SQTT trace
       uint rgpSqttForceDisable_ : 1;    //!< Disables SQTT
-      uint splitSizeForWin7_ : 1;       //!< DMA flush split size for Win 7
       uint enableHwP2P_ : 1;            //!< Forces HW P2P path for testing
       uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
       uint disableSdma_ : 1;            //!< Disable SDMA support
       uint alwaysResident_ : 1;         //!< Make resources resident at allocation time
-      uint reserved_ : 7;
+      uint reserved_ : 9;
     };
     uint value_;
   };
@@ -92,9 +90,6 @@ class Settings : public device::Settings {
   uint hwLDSSize_;               //!< HW local data store size
   uint maxWorkGroupSize_;        //!< Requested workgroup size for this device
   uint preferredWorkGroupSize_;  //!< Requested preferred workgroup size for this device
-  uint workloadSplitSize_;       //!< Workload split size
-  uint minWorkloadTime_;         //!< Minimal workload time in 0.1 ms
-  uint maxWorkloadTime_;         //!< Maximum workload time in 0.1 ms
   uint blitEngine_;              //!< Blit engine type
   uint cacheLineSize_;           //!< Cache line size in bytes
   uint cacheSize_;               //!< L1 cache size in bytes
diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp
index 234ef67882..acd9fba3be 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.cpp
@@ -697,61 +697,6 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
   }
 }
 
-VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) {
-  aluCnt_ = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu * dev.info().simdWidth_ *
-      dev.info().maxComputeUnits_;
-  maxDispatchWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
-      // find time in us
-      dev.settings().maxWorkloadTime_ * aluCnt_;
-  resetCbWorkload(dev);
-}
-
-void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
-  cbWorkload_ = 0;
-  maxCbWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
-      // find time in us
-      dev.settings().minWorkloadTime_ * aluCnt_;
-}
-
-void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
-                                             uint instructions) {
-  if (!dev.settings().splitSizeForWin7_) {
-    dispatchSplitSize_ = 0;
-    return;
-  }
-
-  uint64_t workload = threads * instructions;
-  if (maxDispatchWorkload_ < workload) {
-    dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
-    uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_;
-    if ((dispatchSplitSize_ % fullLoad) != 0) {
-      dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad;
-    }
-  } else {
-    dispatchSplitSize_ =
-        (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0;
-  }
-}
-
-bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) {
-  bool cbReady = false;
-  uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions;
-  // Add current workload to the overall workload in the current DMA
-  cbWorkload_ += workload;
-  // Did it exceed maximum?
-  if (cbWorkload_ > maxCbWorkload_) {
-    // Reset DMA workload
-    cbWorkload_ = 0;
-    // Increase workload of the next DMA buffer by 50%
-    maxCbWorkload_ = maxCbWorkload_ * 3 / 2;
-    if (maxCbWorkload_ > maxDispatchWorkload_) {
-      maxCbWorkload_ = maxDispatchWorkload_;
-    }
-    cbReady = true;
-  }
-  return cbReady;
-}
-
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
   if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
     if (pinnedMems_.size() > 7) {
@@ -897,7 +842,6 @@ VirtualGPU::VirtualGPU(Device& device)
       gpuDevice_(static_cast<Device&>(device)),
       printfDbgHSA_(nullptr),
       tsCache_(nullptr),
-      dmaFlushMgmt_(device),
       managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
       writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
       hwRing_(0),
@@ -932,11 +876,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
   // because destructor calls eraseResourceList() even if create() failed
   dev().resizeResoureList(index());
 
-  if (index() >= GPU_MAX_COMMAND_QUEUES) {
-    // Cap the maximum number of concurrent Virtual GPUs
-    return false;
-  }
-
   // Virtual GPU will have profiling enabled
   state_.profiling_ = profiling;
 
@@ -2632,16 +2571,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
   // Add ISA memory object to the resource tracking list
   AddKernel(kernel);
 
-  bool needFlush = false;
-  // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
-  // are in the same cmdBuffer
-  if (!state_.perfCounterEnabled_) {
-    dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize());
-    if (dmaFlushMgmt().dispatchSplitSize() != 0) {
-      needFlush = true;
-    }
-  }
-
   // Check if it is blit kernel. If it is, then check if split is needed.
   if (hsaKernel.isInternalKernel()) {
     // Calculate new group size for each submission
@@ -2737,7 +2666,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     }
 
     // Update the global GPU event
-    setGpuEvent(gpuEvent, needFlush);
+    constexpr bool kNeedFLush = false;
+    setGpuEvent(gpuEvent, kNeedFLush);
 
     if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
       LogError("Couldn't read printf data from the buffer!\n");
@@ -2799,10 +2729,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
     if (!foundEvent) {
       state_.forceWait_ = true;
     }
-    // If we don't have any more batches, then assume GPU is idle
-    else if (cbQueue_.empty()) {
-      dmaFlushMgmt_.resetCbWorkload(dev());
-    }
   }
 }
 
@@ -3325,11 +3251,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
       cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
 
       uint64_t endTimeStampCPU = amd::Os::timeNanos();
-      // Make sure the command batch has a valid GPU TS
-      if (!GPU_RAW_TIMESTAMP) {
-        // Adjust the base time by the execution time
-        readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
-      }
+      // Adjust the base time by the execution time
+      readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
     }
   }
 }
diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp
index 25c5f96d66..2595342c74 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.hpp
@@ -290,36 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
     size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
   };
 
-  class DmaFlushMgmt : public amd::EmbeddedObject {
-   public:
-    DmaFlushMgmt(const Device& dev);
-
-    // Resets DMA command buffer workload
-    void resetCbWorkload(const Device& dev);
-
-    // Finds split size for the current dispatch
-    void findSplitSize(const Device& dev,  //!< GPU device object
-                       uint64_t threads,   //!< Total number of execution threads
-                       uint instructions   //!< Number of ALU instructions
-    );
-
-    // Returns TRUE if DMA command buffer is ready for a flush
-    bool isCbReady(VirtualGPU& gpu,   //!< Virtual GPU object
-                   uint64_t threads,  //!< Total number of execution threads
-                   uint instructions  //!< Number of ALU instructions
-    );
-
-    // Returns dispatch split size
-    uint dispatchSplitSize() const { return dispatchSplitSize_; }
-
-   private:
-    uint64_t maxDispatchWorkload_;  //!< Maximum number of operations for a single dispatch
-    uint64_t maxCbWorkload_;        //!< Maximum number of operations for DMA command buffer
-    uint64_t cbWorkload_;           //!< Current number of operations in DMA command buffer
-    uint aluCnt_;                   //!< All ALUs on the chip
-    uint dispatchSplitSize_;        //!< Dispath split size in elements
-  };
-
  public:
   VirtualGPU(Device& device);
   //! Creates virtual gpu object
@@ -470,9 +440,6 @@ class VirtualGPU : public device::VirtualDevice {
   //! Returns hsaQueueMem_
   const Memory* hsaQueueMem() const { return hsaQueueMem_; }
 
-  //! Returns DMA flush management structure
-  const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
-
   //! Returns the HW ring used on this virtual device
   uint hwRing() const { return hwRing_; }
 
@@ -695,8 +662,6 @@ class VirtualGPU : public device::VirtualDevice {
   TimeStampCache* tsCache_;            //!< TimeStamp cache
   MemoryDependency memoryDependency_;  //!< Memory dependency class
 
-  DmaFlushMgmt dmaFlushMgmt_;  //!< DMA flush management
-
   std::vector<amd::Memory*> pinnedMems_;  //!< Pinned memory list
 
   ManagedBuffer managedBuffer_;  //!< Managed write buffer
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 763d941b4b..9cfe9d3b09 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -233,7 +233,7 @@ class NullDevice : public amd::Device {
 
   //! Determine if we can use device memory for SVM
   const bool forceFineGrain(amd::Memory* memory) const {
-    return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1);
+    return (memory->getContext().devices().size() > 1);
   }
 
   virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle,
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index 71c341ad43..e747c9c2f6 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -36,17 +36,10 @@ Settings::Settings() {
   doublePrecision_ = ::CL_KHR_FP64;
 
   enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
-  enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM;
 
   maxWorkGroupSize_ = 1024;
   preferredWorkGroupSize_ = 256;
 
-  maxWorkGroupSize2DX_ = 16;
-  maxWorkGroupSize2DY_ = 16;
-  maxWorkGroupSize3DX_ = 4;
-  maxWorkGroupSize3DY_ = 4;
-  maxWorkGroupSize3DZ_ = 4;
-
   kernargPoolSize_ = HSA_KERNARG_POOL_SIZE;
 
   // Determine if user is requesting Non-Coherent mode
@@ -96,7 +89,6 @@ Settings::Settings() {
   cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
                           ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
   system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
-  skip_copy_sync_      = ROC_SKIP_COPY_SYNC;
 
   // Use coarse grain system memory for kernel arguments by default (to keep GPU cache)
   fgs_kernel_arg_ = false;
@@ -201,23 +193,6 @@ void Settings::override() {
     preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
   }
 
-  if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
-    maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
-  }
-  if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
-    maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
-  }
-
-  if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
-    maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
-  }
-  if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
-    maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
-  }
-  if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
-    maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
-  }
-
   if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
     xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
   }
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index d2fffd73db..4f745f8521 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -43,7 +43,6 @@ class Settings : public device::Settings {
     struct {
       uint doublePrecision_ : 1;        //!< Enables double precision support
       uint enableLocalMemory_ : 1;      //!< Enable GPUVM memory
-      uint enableCoarseGrainSVM_ : 1;   //!< Enable device memory for coarse grain SVM allocations
       uint enableNCMode_ : 1;           //!< Enable Non Coherent mode for system memory
       uint imageDMA_ : 1;               //!< Enable direct image DMA transfers
       uint stagedXferRead_ : 1;         //!< Uses a staged buffer read
@@ -51,11 +50,10 @@ class Settings : public device::Settings {
       uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
       uint cpu_wait_for_signal_ : 1;    //!< Wait for HSA signal on CPU
       uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
-      uint skip_copy_sync_ : 1;         //!< Ignore explicit HSA signal waits for copy functionality
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint coop_sync_ : 1;              //!< grid and multi-grid sync for gfx940+
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint reserved_ : 18;
+      uint reserved_ : 20;
     };
     uint value_;
   };
@@ -66,15 +64,6 @@ class Settings : public device::Settings {
   //! Preferred workgroup size
   uint preferredWorkGroupSize_;
 
-  //! Default max workgroup sizes for 2D
-  int maxWorkGroupSize2DX_;
-  int maxWorkGroupSize2DY_;
-
-  //! Default max workgroup sizes for 3D
-  int maxWorkGroupSize3DX_;
-  int maxWorkGroupSize3DY_;
-  int maxWorkGroupSize3DZ_;
-
   uint kernargPoolSize_;
   uint numDeviceEvents_;      //!< The number of device events
   uint numWaitEvents_;        //!< The number of wait events for device enqueue
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 129e7466b2..6450dd86ae 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -490,7 +490,7 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
       // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
       // and ignore the signal wait, relying on in-order engine execution
       const Settings& settings = gpu_.dev().settings();
-      if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
+      if (engine != HwQueueEngine::Compute) {
         explicit_wait = true;
       }
     }
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 3baae3e590..809278d1b4 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF,                                       \
         "The mask to enable specific kinds of logs")                          \
 debug(uint, DEBUG_GPU_FLAGS, 0,                                               \
         "The debug options for GPU device")                                   \
-release(uint, GPU_MAX_COMMAND_QUEUES, 300,                                    \
-        "The maximum number of concurrent Virtual GPUs")                      \
 release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */         \
         "The default command queue thread stack size")                        \
 release(int, GPU_MAX_WORKGROUP_SIZE, 0,                                       \
         "Maximum number of workitems in a workgroup for GPU, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0,                                  \
-        "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0,                                  \
-        "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0,                                  \
-        "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0,                                  \
-        "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0,                                  \
-        "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
 debug(bool, CPU_MEMORY_GUARD_PAGES, false,                                    \
         "Use guard pages for CPU memory")                                     \
 debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64,                                 \
@@ -74,8 +62,6 @@ release(uint, GPU_BLIT_ENGINE_TYPE, 0x0,                                      \
         "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel")       \
 release(bool, GPU_FLUSH_ON_EXECUTION, false,                                  \
         "Submit commands to HW on every operation. 0 - Disable, 1 - Enable")  \
-release(bool, GPU_USE_SYNC_OBJECTS, true,                                     \
-        "If enabled, use sync objects instead of polling")                    \
 release(bool, CL_KHR_FP64, true,                                              \
         "Enable/Disable support for double precision")                        \
 release(cstring, AMD_OCL_BUILD_OPTIONS, 0,                                    \
@@ -86,12 +72,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0,                                     \
         "Set clLinkProgram()'s options (override)")                           \
 release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0,                              \
         "Append clLinkProgram()'s options")                                   \
-release(cstring, AMD_OCL_SC_LIB, 0,                                           \
-        "Set shader compiler shared library name or path")                    \
 debug(cstring, AMD_OCL_SUBST_OBJFILE, 0,                                      \
         "Specify binary substitution config file for OpenCL")                 \
-debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false,                                \
-        "Enable the error dialog on Windows")                                 \
 release(size_t, GPU_PINNED_XFER_SIZE, 32,                                     \
         "The pinned buffer size for pinning in read/write transfers in MiB")  \
 release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128,                                \
@@ -100,12 +82,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
         "The resource cache size in MB")                                      \
 release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
         "The maximum size accepted for suballocaitons in KB")                 \
-release(bool, GPU_FORCE_64BIT_PTR, 0,                                         \
-        "Forces 64 bit pointers on GPU")                                      \
-release(bool, GPU_FORCE_OCL20_32BIT, 0,                                       \
-        "Forces 32 bit apps to take CLANG\HSAIL path")                        \
-release(bool, GPU_RAW_TIMESTAMP, 0,                                           \
-        "Reports GPU raw timestamps in GPU timeline")                         \
 release(size_t, GPU_NUM_MEM_DEPENDENCY, 256,                                  \
         "Number of memory objects for dependency tracking")                   \
 release(size_t, GPU_XFER_BUFFER_SIZE, 0,                                      \
@@ -116,32 +92,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85,                                   \
         "Maximum size of a single allocation as percentage of total")         \
 release(uint, GPU_NUM_COMPUTE_RINGS, 2,                                       \
         "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
-release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1,                                 \
-        "GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \
-release(uint, GPU_WORKLOAD_SPLIT, 22,                                         \
-        "Workload split size")                                                \
-release(bool, GPU_USE_SINGLE_SCRATCH, false,                                  \
-        "Use single scratch buffer per device instead of per HW ring")        \
 release(bool, AMD_OCL_WAIT_COMMAND, false,                                    \
         "1 = Enable a wait for every submitted command")                      \
 release(uint, GPU_PRINT_CHILD_KERNEL, 0,                                      \
         "Prints the specified number of the child kernels")                   \
 release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
         "Use a dedicated device queue for the actual submissions")            \
-release(bool, GPU_ENABLE_LARGE_ALLOCATION, true,                              \
-        "Enable >4GB single allocations")                                     \
 release(bool, AMD_THREAD_TRACE_ENABLE, true,                                  \
         "Enable thread trace extension")                                      \
-release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200),                        \
+release(uint, OPENCL_VERSION, 200,                                            \
         "Force GPU opencl verison")                                           \
 release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
         "Enable HSA device local memory usage")                               \
 release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024,                             \
         "Kernarg pool size")                                                  \
-release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true,                              \
-        "Enable device memory for coarse grain SVM allocations")              \
-release(bool, GPU_IFH_MODE, false,                                            \
-        "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
 release(bool, GPU_MIPMAP, true,                                               \
         "Enables GPU mipmap extension")                                       \
 release(uint, GPU_ENABLE_PAL, 2,                                              \
@@ -152,8 +116,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1,                              \
         "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
 release(uint, OCL_SET_SVM_SIZE, 4*16384,                                      \
         "set SVM space size for discrete GPU")                                \
-debug(uint, OCL_SYSMEM_REQUIREMENT, 2,                                        \
-        "Use flag to change the minimum requirement of system memory not to downgrade")        \
 release(uint, GPU_WAVES_PER_SIMD, 0,                                          \
         "Force the number of waves per SIMD (1-10)")                          \
 release(bool, GPU_WAVE_LIMIT_ENABLE, false,                                   \
@@ -176,10 +138,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "",                              \
         "File path prefix for dumping wave limiter output")                   \
 release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "",                             \
         "File path prefix for tracing wave limiter")                          \
-release(bool, OCL_CODE_CACHE_ENABLE, false,                                   \
-        "1 = Enable compiler code cache")                                     \
-release(bool, OCL_CODE_CACHE_RESET, false,                                    \
-        "1 =  Reset the compiler code cache storage")                         \
 release(bool, PAL_DISABLE_SDMA, false,                                        \
         "1 = Disable SDMA for PAL")                                           \
 release(uint, PAL_RGP_DISP_COUNT, 10000,                                      \
@@ -243,10 +201,6 @@ release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true,                                  \
         "Enable CPU wait for dependent HSA signals.")                         \
 release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true,                                  \
         "Enable system scope for signals (uses interrupts).")                 \
-release(bool, ROC_SKIP_COPY_SYNC, false,                                      \
-        "Skips copy syncs if runtime can predict the same engine.")           \
-release(bool, ROC_ENABLE_PRE_VEGA, false,                                     \
-        "Enable support of pre-vega ASICs in ROCm path")                      \
 release(bool, GPU_FORCE_QUEUE_PROFILING, false,                               \
         "Force command queue profiling by default")                           \
 release(bool, HIP_MEM_POOL_SUPPORT, false,                                    \
diff --git a/projects/clr/rocclr/utils/macros.hpp b/projects/clr/rocclr/utils/macros.hpp
index 02fef7599b..aaf7e7bae0 100644
--- a/projects/clr/rocclr/utils/macros.hpp
+++ b/projects/clr/rocclr/utils/macros.hpp
@@ -178,12 +178,6 @@
 #define ALWAYSINLINE
 #endif  // !_MSC_VER
 
-#ifdef BRAHMA
-#define IS_BRAHMA true
-#else
-#define IS_BRAHMA false
-#endif
-
 //! \endcond
 
 #endif  // MACROS_HPP_