SWDEV-407533 - [ABI Break] Purge unused env vars

Change-Id: I627950e8ebb6299affc602754a20d442dbe42b14
2023-08-23 13:35:56 -04:00
commit 077311153a
@@ -933,20 +933,6 @@ Settings::Settings() : value_(0) {
  customHostAllocator_ = false;
  waitCommand_ = AMD_OCL_WAIT_COMMAND;
  supportDepthsRGB_ = false;
-  commandQueues_ = 200;  //!< Field value set to maximum number
-                         //!< concurrent Virtual GPUs for default
-
-  overrideLclSet = (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE)) ? 1 : 0;
-  overrideLclSet |=
-      (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y))
-      ? 2
-      : 0;
-  overrideLclSet |=
-      (!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
-       !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z))
-      ? 4
-      : 0;
-
  fenceScopeAgent_ = AMD_OPT_FLUSH;
  if (amd::IS_HIP) {
    if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) {
@@ -640,7 +640,6 @@ class Settings : public amd::HeapObject {
  uint64_t extensions_;  //!< Supported OCL extensions
  union {
    struct {
-      uint overrideLclSet : 3;        //!< Bit mask to override the local size
      uint apuSystem_ : 1;            //!< Device is APU system with shared memory
      uint supportRA_ : 1;            //!< Support RA channel order format
      uint waitCommand_ : 1;          //!< Enables a wait for every submitted command
@@ -660,14 +659,11 @@ class Settings : public amd::HeapObject {
      uint enableCoopMultiDeviceGroups_ : 1; //!< Enable cooperative groups multi device
      uint fenceScopeAgent_ : 1;      //!< Enable fence scope agent in AQL dispatch packet
      uint rocr_backend_ : 1;         //!< Device uses ROCr backend for submissions
-      uint reserved_ : 11;
+      uint reserved_ : 14;
    };
    uint value_;
  };

-  uint commandQueues_;  //!< Field value for maximum number
-                        //!< concurrent Virtual GPUs for each backend
-
  //! Default constructor
  Settings();

@@ -1383,7 +1379,7 @@ class Isa {

  /// @returns If the ROCm runtime supports the ISA.
  bool runtimeRocSupported() const {
-    if (!IS_HIP && !ROC_ENABLE_PRE_VEGA && (versionMajor_ == 8)) {
+    if (!IS_HIP && (versionMajor_ == 8)) {
      return false;
    }
    return runtimeRocSupported_;
@@ -684,102 +684,82 @@ void Kernel::FindLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
  if (workGroupInfo()->compileSize_[0] == 0) {
    // Find the default local workgroup size, if it wasn't specified
    if (lclWorkSize[0] == 0) {
-      if ((device().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
-        // Find threads per group
-        size_t thrPerGrp = workGroupInfo()->size_;
+      // Find threads per group
+      size_t thrPerGrp = workGroupInfo()->size_;

-        // Check if kernel uses images
-        if (flags_.imageEna_ &&
-          // and thread group is a multiple value of wavefronts
-          ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
-          // and it's 2 or 3-dimensional workload
-          (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
-          // Use 8x8 workgroup size if kernel has image writes
-          if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
-            lclWorkSize[0] = 8;
-            lclWorkSize[1] = 8;
-          }
-          else {
-            lclWorkSize[0] = 16;
-            lclWorkSize[1] = 16;
-          }
-          if (workDim == 3) {
-            lclWorkSize[2] = 1;
-          }
+      // Check if kernel uses images
+      if (flags_.imageEna_ &&
+        // and thread group is a multiple value of wavefronts
+        ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
+        // and it's 2 or 3-dimensional workload
+        (workDim > 1) && (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0))) {
+        // Use 8x8 workgroup size if kernel has image writes
+        if (flags_.imageWriteEna_ || (thrPerGrp != device().info().preferredWorkGroupSize_)) {
+          lclWorkSize[0] = 8;
+          lclWorkSize[1] = 8;
        }
        else {
-          size_t tmp = thrPerGrp;
-          // Split the local workgroup into the most efficient way
-          for (uint d = 0; d < workDim; ++d) {
-            size_t div = tmp;
-            for (; (gblWorkSize[d] % div) != 0; div--)
-              ;
-            lclWorkSize[d] = div;
-            tmp /= div;
-          }
+          lclWorkSize[0] = 16;
+          lclWorkSize[1] = 16;
+        }
+        if (workDim == 3) {
+          lclWorkSize[2] = 1;
+        }
+      }
+      else {
+        size_t tmp = thrPerGrp;
+        // Split the local workgroup into the most efficient way
+        for (uint d = 0; d < workDim; ++d) {
+          size_t div = tmp;
+          for (; (gblWorkSize[d] % div) != 0; div--)
+            ;
+          lclWorkSize[d] = div;
+          tmp /= div;
+        }

-          if (!workGroupInfo()->uniformWorkGroupSize_) {
-            // Assuming DWORD access
-            const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;
+        if (!workGroupInfo()->uniformWorkGroupSize_) {
+          // Assuming DWORD access
+          const uint cacheLineMatch = device().info().globalMemCacheLineSize_ >> 2;

-            // Check if we couldn't find optimal workload
-            if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
-                // or size is too small for the cache line
-              (lclWorkSize[0] < cacheLineMatch)) {
-              size_t maxSize = 0;
-              size_t maxDim = 0;
+          // Check if we couldn't find optimal workload
+          if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
+              // or size is too small for the cache line
+            (lclWorkSize[0] < cacheLineMatch)) {
+            size_t maxSize = 0;
+            size_t maxDim = 0;
+            for (uint d = 0; d < workDim; ++d) {
+              if (maxSize < gblWorkSize[d]) {
+                maxSize = gblWorkSize[d];
+                maxDim = d;
+              }
+            }
+            // Use X dimension as high priority. Runtime will assume that
+            // X dimension is more important for the address calculation
+            if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
+              lclWorkSize[0] = cacheLineMatch;
+              thrPerGrp /= cacheLineMatch;
+              lclWorkSize[maxDim] = thrPerGrp;
+              for (uint d = 1; d < workDim; ++d) {
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
+                }
+              }
+            }
+            else {
+              // Check if a local workgroup has the most optimal size
+              if (thrPerGrp > maxSize) {
+                thrPerGrp = maxSize;
+              }
+              lclWorkSize[maxDim] = thrPerGrp;
              for (uint d = 0; d < workDim; ++d) {
-                if (maxSize < gblWorkSize[d]) {
-                  maxSize = gblWorkSize[d];
-                  maxDim = d;
-                }
-              }
-              // Use X dimension as high priority. Runtime will assume that
-              // X dimension is more important for the address calculation
-              if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
-                lclWorkSize[0] = cacheLineMatch;
-                thrPerGrp /= cacheLineMatch;
-                lclWorkSize[maxDim] = thrPerGrp;
-                for (uint d = 1; d < workDim; ++d) {
-                  if (d != maxDim) {
-                    lclWorkSize[d] = 1;
-                  }
-                }
-              }
-              else {
-                // Check if a local workgroup has the most optimal size
-                if (thrPerGrp > maxSize) {
-                  thrPerGrp = maxSize;
-                }
-                lclWorkSize[maxDim] = thrPerGrp;
-                for (uint d = 0; d < workDim; ++d) {
-                  if (d != maxDim) {
-                    lclWorkSize[d] = 1;
-                  }
+                if (d != maxDim) {
+                  lclWorkSize[d] = 1;
                }
              }
            }
          }
        }
      }
-      else {
-        // Use overrides when app doesn't provide workgroup dimensions
-        if (workDim == 1) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
-        }
-        else if (workDim == 2) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
-        }
-        else if (workDim == 3) {
-          lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
-          lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
-          lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
-        }
-        else {
-          assert(0 && "Invalid workDim!");
-        }
-      }
    }
  }
  else {
@@ -300,7 +300,7 @@ bool NullDevice::create(const char* palName, const amd::Isa& isa, Pal::GfxIpLeve
                               nullptr,
                               nullptr,
                               nullptr,
-                               AMD_OCL_SC_LIB};
+                               nullptr};
    // Initialize the compiler handle
    acl_error error;
    compiler_ = amd::Hsail::CompilerInit(&opts, &error);
@@ -1029,7 +1029,7 @@ bool Device::create(Pal::IDevice* device) {
                               nullptr,
                               nullptr,
                               nullptr,
-                               AMD_OCL_SC_LIB};
+                               nullptr};
    // Initialize the compiler handle
    acl_error error;
    compiler_ = amd::Hsail::CompilerInit(&opts, &error);
@@ -33,19 +33,6 @@

 namespace pal {

-/*! \brief information for adjusting maximum workload time
- *
- *  This structure contains the time and OS minor version for max workload time
- *  adjustment for Windows 7 or 8.
- */
-struct ModifyMaxWorkload {
-  uint32_t time;          //!< max work load time (10x ms)
-  uint32_t minorVersion;  //!< OS minor version
-#if defined(_WIN32)
-  BYTE comparisonOps;  //!< Comparison option
-#endif
-};
-
 Settings::Settings() {
  // Initialize the GPU device default settings
  oclVersion_ = OpenCL12;
@@ -73,9 +60,6 @@ Settings::Settings() {

  libSelector_ = amd::LibraryUndefined;

-  // Enable workload split by default (for 24 bit arithmetic or timeout)
-  workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;
-
  // By default use host blit
  blitEngine_ = BlitEngineHost;
  pinnedXferSize_ = GPU_PINNED_XFER_SIZE * Mi;
@@ -117,15 +101,10 @@ Settings::Settings() {
  // Number of compute rings.
  numComputeRings_ = 0;

-  minWorkloadTime_ = 1;       // 0.1 ms
-  maxWorkloadTime_ = 500000;  // 500 ms
-
  // Controls tiled images in persistent
  //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
  linearPersistentImage_ = false;

-  useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
-
  // Device enqueuing settings
  numDeviceEvents_ = 1024;
  numWaitEvents_ = 8;
@@ -177,14 +156,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,

  // Disable thread trace by default for all devices
  threadTraceEnable_ = false;
-  bool doublePrecision = true;
-
-  // Update GPU specific settings and info structure if we have any
-#if defined(_WIN32)
-  ModifyMaxWorkload modifyMaxWorkload = {0, 1, VER_EQUAL};
-#else
-  ModifyMaxWorkload modifyMaxWorkload = {0};
-#endif

  // APU systems
  if (palProp.gpuType == Pal::GpuType::Integrated) {
@@ -250,14 +221,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
        // GFX10.1 HW doesn't support custom pitch. Enable double copy workaround
        imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
      }
-      if (false) {
-        // UnknownDevice0 HW doesn't have SDMA engine
-        disableSdma_ = true;
-        // And LDS is limited to 32KB
-        hwLDSSize_ = 32 * Ki;
-        // No fp64 support
-        doublePrecision = false;
-      }
      // Fall through to AI (gfx9) ...
    case Pal::AsicRevision::Vega20:
      // Enable HW P2P path for Vega20+. Runtime still relies on KMD/PAL for support
@@ -277,15 +240,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
    case Pal::AsicRevision::Carrizo:
    case Pal::AsicRevision::Bristol:
    case Pal::AsicRevision::Stoney:
-      if (!aiPlus_) {
-        // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10
-        minWorkloadTime_ = 1000;
-        modifyMaxWorkload.time = 1000;       // Decided by experiment
-        modifyMaxWorkload.minorVersion = 1;  // Win 7
-#if defined(_WIN32)
-        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // Limit to Win 7 only
-#endif
-      }
    case Pal::AsicRevision::Iceland:
    case Pal::AsicRevision::Tonga:
    case Pal::AsicRevision::Fiji:
@@ -307,15 +261,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
    case Pal::AsicRevision::Godavari:
    case Pal::AsicRevision::Spectre:
    case Pal::AsicRevision::Spooky:
-      if (!viPlus_) {
-        // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
-        modifyMaxWorkload.time = 250000;     // 250ms
-        modifyMaxWorkload.minorVersion = 1;  // Win 7
-#if defined(_WIN32)
-        modifyMaxWorkload.comparisonOps = VER_EQUAL;  // limit to Win 7
-#endif
-      }
-    // Fall through ...
    case Pal::AsicRevision::Bonaire:
    case Pal::AsicRevision::Hawaii:
    case Pal::AsicRevision::HawaiiPro:
@@ -331,13 +276,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,

      libSelector_ = amd::GPU_Library_CI;
      if (LP64_SWITCH(false, true)) {
-        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
-            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
-            : OpenCL12;
-      }
-      if (GPU_FORCE_OCL20_32BIT) {
-        force32BitOcl20_ = true;
-        oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/
+        oclVersion_ = !reportAsOCL12Device
            ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR))
            : OpenCL12;
      }
@@ -348,28 +287,14 @@ bool Settings::create(const Pal::DeviceProperties& palProp,

      // Cap at OpenCL20 for now
      if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
-
-      // This needs to be cleaned once 64bit addressing is stable
-      if (oclVersion_ < OpenCL20) {
-        use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR)
-            ? LP64_SWITCH(false,
-                          /*calAttr.isWorkstation ||*/ true)
-            : GPU_FORCE_64BIT_PTR;
-      } else {
-        if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) {
-          use64BitPtr_ = true;
-        }
-      }
+
+      use64BitPtr_ = LP64_SWITCH(false, true);

      if (oclVersion_ >= OpenCL20) {
        supportDepthsRGB_ = true;
      }
      if (use64BitPtr_) {
-        if (GPU_ENABLE_LARGE_ALLOCATION) {
-          maxAllocSize_ = 64ULL * Gi;
-        } else {
-          maxAllocSize_ = 4048 * Mi;
-        }
+        maxAllocSize_ = 64ULL * Gi;
      } else {
        maxAllocSize_ = 3ULL * Gi;
      }
@@ -395,26 +320,6 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
  // Image DMA must be disabled if SDMA is disabled
  imageDMA_ &= !disableSdma_;

-  splitSizeForWin7_ = false;
-
-#if defined(_WIN32)
-  OSVERSIONINFOEX versionInfo = {0};
-  versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-  versionInfo.dwMajorVersion = 6;
-  versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion;
-
-  DWORDLONG conditionMask = 0;
-  VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps);
-  VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps);
-
-  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
-    splitSizeForWin7_ = true;  // Update flag of DMA flush split size for Win 7
-    if (modifyMaxWorkload.time > 0) {
-      maxWorkloadTime_ = modifyMaxWorkload.time;  // Update max workload time
-    }
-  }
-#endif  // defined(_WIN32)
-
  // Enable atomics support
  enableExtension(ClKhrInt64BaseAtomics);
  enableExtension(ClKhrInt64ExtendedAtomics);
@@ -457,23 +362,19 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
  // HW doesn't support untiled image writes
  // hostMemDirectAccess_ |= HostMemImage;

-  if (doublePrecision) {
-    // Report FP_FAST_FMA define if double precision HW
-    reportFMA_ = true;
-    // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
-    // Bonaire, Kalindi, Spectre and Spooky so disable
-    // FP_FMA_FMAF for those parts in switch below
-    reportFMAF_ = true;
-  }
+  // Report FP_FAST_FMA define if double precision HW
+  reportFMA_ = true;
+  // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
+  // Bonaire, Kalindi, Spectre and Spooky so disable
+  // FP_FMA_FMAF for those parts in switch below
+  reportFMAF_ = true;

-  // Make sure device actually supports double precision
-  doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
  if (doublePrecision_) {
    // Enable KHR double precision extension
    enableExtension(ClKhrFp64);
  }

-  if (!useLightning_ && doublePrecision) {
+  if (!useLightning_) {
    // Enable AMD double precision extension
    doublePrecision_ = true;
    enableExtension(ClAmdFp64);
@@ -70,19 +70,17 @@ class Settings : public device::Settings {
      uint gfx10Plus_ : 1;              //!< gfx10 and post gfx10 features
      uint threadTraceEnable_ : 1;      //!< Thread trace enable
      uint linearPersistentImage_ : 1;  //!< Allocates linear images in persistent
-      uint useSingleScratch_ : 1;       //!< Allocates single scratch per device
      uint svmAtomics_ : 1;             //!< SVM device atomics
      uint svmFineGrainSystem_ : 1;     //!< SVM fine grain system support
      uint useDeviceQueue_ : 1;         //!< Submit to separate device queue
      uint sdamPageFaultWar_ : 1;       //!< SDMA page fault workaround
      uint rgpSqttWaitIdle_ : 1;        //!< Wait for idle after SQTT trace
      uint rgpSqttForceDisable_ : 1;    //!< Disables SQTT
-      uint splitSizeForWin7_ : 1;       //!< DMA flush split size for Win 7
      uint enableHwP2P_ : 1;            //!< Forces HW P2P path for testing
      uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
      uint disableSdma_ : 1;            //!< Disable SDMA support
      uint alwaysResident_ : 1;         //!< Make resources resident at allocation time
-      uint reserved_ : 7;
+      uint reserved_ : 9;
    };
    uint value_;
  };
@@ -92,9 +90,6 @@ class Settings : public device::Settings {
  uint hwLDSSize_;               //!< HW local data store size
  uint maxWorkGroupSize_;        //!< Requested workgroup size for this device
  uint preferredWorkGroupSize_;  //!< Requested preferred workgroup size for this device
-  uint workloadSplitSize_;       //!< Workload split size
-  uint minWorkloadTime_;         //!< Minimal workload time in 0.1 ms
-  uint maxWorkloadTime_;         //!< Maximum workload time in 0.1 ms
  uint blitEngine_;              //!< Blit engine type
  uint cacheLineSize_;           //!< Cache line size in bytes
  uint cacheSize_;               //!< L1 cache size in bytes
@@ -697,61 +697,6 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
  }
 }

-VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) {
-  aluCnt_ = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu * dev.info().simdWidth_ *
-      dev.info().maxComputeUnits_;
-  maxDispatchWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
-      // find time in us
-      dev.settings().maxWorkloadTime_ * aluCnt_;
-  resetCbWorkload(dev);
-}
-
-void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
-  cbWorkload_ = 0;
-  maxCbWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
-      // find time in us
-      dev.settings().minWorkloadTime_ * aluCnt_;
-}
-
-void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
-                                             uint instructions) {
-  if (!dev.settings().splitSizeForWin7_) {
-    dispatchSplitSize_ = 0;
-    return;
-  }
-
-  uint64_t workload = threads * instructions;
-  if (maxDispatchWorkload_ < workload) {
-    dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
-    uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_;
-    if ((dispatchSplitSize_ % fullLoad) != 0) {
-      dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad;
-    }
-  } else {
-    dispatchSplitSize_ =
-        (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0;
-  }
-}
-
-bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) {
-  bool cbReady = false;
-  uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions;
-  // Add current workload to the overall workload in the current DMA
-  cbWorkload_ += workload;
-  // Did it exceed maximum?
-  if (cbWorkload_ > maxCbWorkload_) {
-    // Reset DMA workload
-    cbWorkload_ = 0;
-    // Increase workload of the next DMA buffer by 50%
-    maxCbWorkload_ = maxCbWorkload_ * 3 / 2;
-    if (maxCbWorkload_ > maxDispatchWorkload_) {
-      maxCbWorkload_ = maxDispatchWorkload_;
-    }
-    cbReady = true;
-  }
-  return cbReady;
-}
-
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
    if (pinnedMems_.size() > 7) {
@@ -897,7 +842,6 @@ VirtualGPU::VirtualGPU(Device& device)
      gpuDevice_(static_cast<Device&>(device)),
      printfDbgHSA_(nullptr),
      tsCache_(nullptr),
-      dmaFlushMgmt_(device),
      managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
      writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
      hwRing_(0),
@@ -932,11 +876,6 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
  // because destructor calls eraseResourceList() even if create() failed
  dev().resizeResoureList(index());

-  if (index() >= GPU_MAX_COMMAND_QUEUES) {
-    // Cap the maximum number of concurrent Virtual GPUs
-    return false;
-  }
-
  // Virtual GPU will have profiling enabled
  state_.profiling_ = profiling;

@@ -2632,16 +2571,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
  // Add ISA memory object to the resource tracking list
  AddKernel(kernel);

-  bool needFlush = false;
-  // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
-  // are in the same cmdBuffer
-  if (!state_.perfCounterEnabled_) {
-    dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize());
-    if (dmaFlushMgmt().dispatchSplitSize() != 0) {
-      needFlush = true;
-    }
-  }
-
  // Check if it is blit kernel. If it is, then check if split is needed.
  if (hsaKernel.isInternalKernel()) {
    // Calculate new group size for each submission
@@ -2737,7 +2666,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    }

    // Update the global GPU event
-    setGpuEvent(gpuEvent, needFlush);
+    constexpr bool kNeedFlush = false;
+    setGpuEvent(gpuEvent, kNeedFlush);

    if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
      LogError("Couldn't read printf data from the buffer!\n");
@@ -2799,10 +2729,6 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
    if (!foundEvent) {
      state_.forceWait_ = true;
    }
-    // If we don't have any more batches, then assume GPU is idle
-    else if (cbQueue_.empty()) {
-      dmaFlushMgmt_.resetCbWorkload(dev());
-    }
  }
 }

@@ -3325,11 +3251,8 @@ void VirtualGPU::waitEventLock(CommandBatch* cb) {
      cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);

      uint64_t endTimeStampCPU = amd::Os::timeNanos();
-      // Make sure the command batch has a valid GPU TS
-      if (!GPU_RAW_TIMESTAMP) {
-        // Adjust the base time by the execution time
-        readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
-      }
+      // Adjust the base time by the execution time
+      readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
    }
  }
 }
@@ -290,36 +290,6 @@ class VirtualGPU : public device::VirtualDevice {
    size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
  };

-  class DmaFlushMgmt : public amd::EmbeddedObject {
-   public:
-    DmaFlushMgmt(const Device& dev);
-
-    // Resets DMA command buffer workload
-    void resetCbWorkload(const Device& dev);
-
-    // Finds split size for the current dispatch
-    void findSplitSize(const Device& dev,  //!< GPU device object
-                       uint64_t threads,   //!< Total number of execution threads
-                       uint instructions   //!< Number of ALU instructions
-    );
-
-    // Returns TRUE if DMA command buffer is ready for a flush
-    bool isCbReady(VirtualGPU& gpu,   //!< Virtual GPU object
-                   uint64_t threads,  //!< Total number of execution threads
-                   uint instructions  //!< Number of ALU instructions
-    );
-
-    // Returns dispatch split size
-    uint dispatchSplitSize() const { return dispatchSplitSize_; }
-
-   private:
-    uint64_t maxDispatchWorkload_;  //!< Maximum number of operations for a single dispatch
-    uint64_t maxCbWorkload_;        //!< Maximum number of operations for DMA command buffer
-    uint64_t cbWorkload_;           //!< Current number of operations in DMA command buffer
-    uint aluCnt_;                   //!< All ALUs on the chip
-    uint dispatchSplitSize_;        //!< Dispath split size in elements
-  };
-
 public:
  VirtualGPU(Device& device);
  //! Creates virtual gpu object
@@ -470,9 +440,6 @@ class VirtualGPU : public device::VirtualDevice {
  //! Returns hsaQueueMem_
  const Memory* hsaQueueMem() const { return hsaQueueMem_; }

-  //! Returns DMA flush management structure
-  const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
-
  //! Returns the HW ring used on this virtual device
  uint hwRing() const { return hwRing_; }

@@ -695,8 +662,6 @@ class VirtualGPU : public device::VirtualDevice {
  TimeStampCache* tsCache_;            //!< TimeStamp cache
  MemoryDependency memoryDependency_;  //!< Memory dependency class

-  DmaFlushMgmt dmaFlushMgmt_;  //!< DMA flush management
-
  std::vector<amd::Memory*> pinnedMems_;  //!< Pinned memory list

  ManagedBuffer managedBuffer_;  //!< Managed write buffer
@@ -233,7 +233,7 @@ class NullDevice : public amd::Device {

  //! Determine if we can use device memory for SVM
  const bool forceFineGrain(amd::Memory* memory) const {
-    return !settings().enableCoarseGrainSVM_ || (memory->getContext().devices().size() > 1);
+    return (memory->getContext().devices().size() > 1);
  }

  virtual bool importExtSemaphore(void** extSemahore, const amd::Os::FileDesc& handle,
@@ -36,17 +36,10 @@ Settings::Settings() {
  doublePrecision_ = ::CL_KHR_FP64;

  enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE;
-  enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM;

  maxWorkGroupSize_ = 1024;
  preferredWorkGroupSize_ = 256;

-  maxWorkGroupSize2DX_ = 16;
-  maxWorkGroupSize2DY_ = 16;
-  maxWorkGroupSize3DX_ = 4;
-  maxWorkGroupSize3DY_ = 4;
-  maxWorkGroupSize3DZ_ = 4;
-
  kernargPoolSize_ = HSA_KERNARG_POOL_SIZE;

  // Determine if user is requesting Non-Coherent mode
@@ -96,7 +89,6 @@ Settings::Settings() {
  cpu_wait_for_signal_ = (!flagIsDefault(ROC_CPU_WAIT_FOR_SIGNAL)) ?
                          ROC_CPU_WAIT_FOR_SIGNAL : cpu_wait_for_signal_;
  system_scope_signal_ = ROC_SYSTEM_SCOPE_SIGNAL;
-  skip_copy_sync_      = ROC_SKIP_COPY_SYNC;

  // Use coarse grain system memory for kernel arguments by default (to keep GPU cache)
  fgs_kernel_arg_ = false;
@@ -201,23 +193,6 @@ void Settings::override() {
    preferredWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
  }

-  if (GPU_MAX_WORKGROUP_SIZE_2D_X != 0) {
-    maxWorkGroupSize2DX_ = GPU_MAX_WORKGROUP_SIZE_2D_X;
-  }
-  if (GPU_MAX_WORKGROUP_SIZE_2D_Y != 0) {
-    maxWorkGroupSize2DY_ = GPU_MAX_WORKGROUP_SIZE_2D_Y;
-  }
-
-  if (GPU_MAX_WORKGROUP_SIZE_3D_X != 0) {
-    maxWorkGroupSize3DX_ = GPU_MAX_WORKGROUP_SIZE_3D_X;
-  }
-  if (GPU_MAX_WORKGROUP_SIZE_3D_Y != 0) {
-    maxWorkGroupSize3DY_ = GPU_MAX_WORKGROUP_SIZE_3D_Y;
-  }
-  if (GPU_MAX_WORKGROUP_SIZE_3D_Z != 0) {
-    maxWorkGroupSize3DZ_ = GPU_MAX_WORKGROUP_SIZE_3D_Z;
-  }
-
  if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
    xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
  }
@@ -43,7 +43,6 @@ class Settings : public device::Settings {
    struct {
      uint doublePrecision_ : 1;        //!< Enables double precision support
      uint enableLocalMemory_ : 1;      //!< Enable GPUVM memory
-      uint enableCoarseGrainSVM_ : 1;   //!< Enable device memory for coarse grain SVM allocations
      uint enableNCMode_ : 1;           //!< Enable Non Coherent mode for system memory
      uint imageDMA_ : 1;               //!< Enable direct image DMA transfers
      uint stagedXferRead_ : 1;         //!< Uses a staged buffer read
@@ -51,11 +50,10 @@ class Settings : public device::Settings {
      uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
      uint cpu_wait_for_signal_ : 1;    //!< Wait for HSA signal on CPU
      uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
-      uint skip_copy_sync_ : 1;         //!< Ignore explicit HSA signal waits for copy functionality
      uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
      uint coop_sync_ : 1;              //!< grid and multi-grid sync for gfx940+
      uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint reserved_ : 18;
+      uint reserved_ : 20;
    };
    uint value_;
  };
@@ -66,15 +64,6 @@ class Settings : public device::Settings {
  //! Preferred workgroup size
  uint preferredWorkGroupSize_;

-  //! Default max workgroup sizes for 2D
-  int maxWorkGroupSize2DX_;
-  int maxWorkGroupSize2DY_;
-
-  //! Default max workgroup sizes for 3D
-  int maxWorkGroupSize3DX_;
-  int maxWorkGroupSize3DY_;
-  int maxWorkGroupSize3DZ_;
-
  uint kernargPoolSize_;
  uint numDeviceEvents_;      //!< The number of device events
  uint numWaitEvents_;        //!< The number of wait events for device enqueue
@@ -490,7 +490,6 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
      // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
      // and ignore the signal wait, relying on in-order engine execution
-      const Settings& settings = gpu_.dev().settings();
-      if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
+      if (engine != HwQueueEngine::Compute) {
        explicit_wait = true;
      }
    }
@@ -30,22 +30,10 @@ release(uint, AMD_LOG_MASK, 0X7FFFFFFF,                                       \
        "The mask to enable specific kinds of logs")                          \
 debug(uint, DEBUG_GPU_FLAGS, 0,                                               \
        "The debug options for GPU device")                                   \
-release(uint, GPU_MAX_COMMAND_QUEUES, 300,                                    \
-        "The maximum number of concurrent Virtual GPUs")                      \
 release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */         \
        "The default command queue thread stack size")                        \
 release(int, GPU_MAX_WORKGROUP_SIZE, 0,                                       \
        "Maximum number of workitems in a workgroup for GPU, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0,                                  \
-        "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0,                                  \
-        "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0,                                  \
-        "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0,                                  \
-        "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
-release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0,                                  \
-        "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
 debug(bool, CPU_MEMORY_GUARD_PAGES, false,                                    \
        "Use guard pages for CPU memory")                                     \
 debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64,                                 \
@@ -74,8 +62,6 @@ release(uint, GPU_BLIT_ENGINE_TYPE, 0x0,                                      \
        "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel")       \
 release(bool, GPU_FLUSH_ON_EXECUTION, false,                                  \
        "Submit commands to HW on every operation. 0 - Disable, 1 - Enable")  \
-release(bool, GPU_USE_SYNC_OBJECTS, true,                                     \
-        "If enabled, use sync objects instead of polling")                    \
 release(bool, CL_KHR_FP64, true,                                              \
        "Enable/Disable support for double precision")                        \
 release(cstring, AMD_OCL_BUILD_OPTIONS, 0,                                    \
@@ -86,12 +72,8 @@ release(cstring, AMD_OCL_LINK_OPTIONS, 0,                                     \
        "Set clLinkProgram()'s options (override)")                           \
 release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0,                              \
        "Append clLinkProgram()'s options")                                   \
-release(cstring, AMD_OCL_SC_LIB, 0,                                           \
-        "Set shader compiler shared library name or path")                    \
 debug(cstring, AMD_OCL_SUBST_OBJFILE, 0,                                      \
        "Specify binary substitution config file for OpenCL")                 \
-debug(bool, AMD_OCL_ENABLE_MESSAGE_BOX, false,                                \
-        "Enable the error dialog on Windows")                                 \
 release(size_t, GPU_PINNED_XFER_SIZE, 32,                                     \
        "The pinned buffer size for pinning in read/write transfers in MiB")  \
 release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128,                                \
@@ -100,12 +82,6 @@ release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
        "The resource cache size in MB")                                      \
 release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
        "The maximum size accepted for suballocaitons in KB")                 \
-release(bool, GPU_FORCE_64BIT_PTR, 0,                                         \
-        "Forces 64 bit pointers on GPU")                                      \
-release(bool, GPU_FORCE_OCL20_32BIT, 0,                                       \
-        "Forces 32 bit apps to take CLANG\HSAIL path")                        \
-release(bool, GPU_RAW_TIMESTAMP, 0,                                           \
-        "Reports GPU raw timestamps in GPU timeline")                         \
 release(size_t, GPU_NUM_MEM_DEPENDENCY, 256,                                  \
        "Number of memory objects for dependency tracking")                   \
 release(size_t, GPU_XFER_BUFFER_SIZE, 0,                                      \
@@ -116,32 +92,20 @@ release(uint, GPU_SINGLE_ALLOC_PERCENT, 85,                                   \
        "Maximum size of a single allocation as percentage of total")         \
 release(uint, GPU_NUM_COMPUTE_RINGS, 2,                                       \
        "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
-release(int, GPU_SELECT_COMPUTE_RINGS_ID, -1,                                 \
-        "GPU select the compute rings ID -1 - disabled, 0 , 1,.. - the forced compute rings ID for submission") \
-release(uint, GPU_WORKLOAD_SPLIT, 22,                                         \
-        "Workload split size")                                                \
-release(bool, GPU_USE_SINGLE_SCRATCH, false,                                  \
-        "Use single scratch buffer per device instead of per HW ring")        \
 release(bool, AMD_OCL_WAIT_COMMAND, false,                                    \
        "1 = Enable a wait for every submitted command")                      \
 release(uint, GPU_PRINT_CHILD_KERNEL, 0,                                      \
        "Prints the specified number of the child kernels")                   \
 release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
        "Use a dedicated device queue for the actual submissions")            \
-release(bool, GPU_ENABLE_LARGE_ALLOCATION, true,                              \
-        "Enable >4GB single allocations")                                     \
 release(bool, AMD_THREAD_TRACE_ENABLE, true,                                  \
        "Enable thread trace extension")                                      \
-release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200),                        \
+release(uint, OPENCL_VERSION, 200,                                            \
        "Force GPU opencl verison")                                           \
 release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
        "Enable HSA device local memory usage")                               \
 release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024,                             \
        "Kernarg pool size")                                                  \
-release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true,                              \
-        "Enable device memory for coarse grain SVM allocations")              \
-release(bool, GPU_IFH_MODE, false,                                            \
-        "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
 release(bool, GPU_MIPMAP, true,                                               \
        "Enables GPU mipmap extension")                                       \
 release(uint, GPU_ENABLE_PAL, 2,                                              \
@@ -152,8 +116,6 @@ release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1,                              \
        "Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
 release(uint, OCL_SET_SVM_SIZE, 4*16384,                                      \
        "set SVM space size for discrete GPU")                                \
-debug(uint, OCL_SYSMEM_REQUIREMENT, 2,                                        \
-        "Use flag to change the minimum requirement of system memory not to downgrade")        \
 release(uint, GPU_WAVES_PER_SIMD, 0,                                          \
        "Force the number of waves per SIMD (1-10)")                          \
 release(bool, GPU_WAVE_LIMIT_ENABLE, false,                                   \
@@ -176,10 +138,6 @@ release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "",                              \
        "File path prefix for dumping wave limiter output")                   \
 release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "",                             \
        "File path prefix for tracing wave limiter")                          \
-release(bool, OCL_CODE_CACHE_ENABLE, false,                                   \
-        "1 = Enable compiler code cache")                                     \
-release(bool, OCL_CODE_CACHE_RESET, false,                                    \
-        "1 =  Reset the compiler code cache storage")                         \
 release(bool, PAL_DISABLE_SDMA, false,                                        \
        "1 = Disable SDMA for PAL")                                           \
 release(uint, PAL_RGP_DISP_COUNT, 10000,                                      \
@@ -243,10 +201,6 @@ release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true,                                  \
        "Enable CPU wait for dependent HSA signals.")                         \
 release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true,                                  \
        "Enable system scope for signals (uses interrupts).")                 \
-release(bool, ROC_SKIP_COPY_SYNC, false,                                      \
-        "Skips copy syncs if runtime can predict the same engine.")           \
-release(bool, ROC_ENABLE_PRE_VEGA, false,                                     \
-        "Enable support of pre-vega ASICs in ROCm path")                      \
 release(bool, GPU_FORCE_QUEUE_PROFILING, false,                               \
        "Force command queue profiling by default")                           \
 release(bool, HIP_MEM_POOL_SUPPORT, false,                                    \
@@ -178,12 +178,6 @@
 #define ALWAYSINLINE
 #endif  // !_MSC_VER

-#ifdef BRAHMA
-#define IS_BRAHMA true
-#else
-#define IS_BRAHMA false
-#endif
-
 //! \endcond

 #endif  // MACROS_HPP_