SWDEV-407533 - [ABI Break] Remove Wavelimiter

Change-Id: I6a2f6fb5a0c3acea93fa0200a69679783e76f5bd [ROCm/clr commit: 7be3a5e33e]
2023-08-24 18:29:40 -04:00
@@ -82,7 +82,6 @@ class EventMarker : public amd::Marker {
              int32_t scope = amd::Device::kCacheStateInvalid)
      : amd::Marker(stream, disableFlush) {
    profilingInfo_.enabled_ = true;
-    profilingInfo_.callback_ = nullptr;
    profilingInfo_.marker_ts_ = markerTs;
    profilingInfo_.clear();
    setEventScope(scope);
@@ -70,7 +70,6 @@ target_sources(rocclr PRIVATE
  ${ROCCLR_SRC_DIR}/device/device.cpp
  ${ROCCLR_SRC_DIR}/device/devkernel.cpp
  ${ROCCLR_SRC_DIR}/device/devprogram.cpp
-  ${ROCCLR_SRC_DIR}/device/devwavelimiter.cpp
  ${ROCCLR_SRC_DIR}/device/hsailctx.cpp
  ${ROCCLR_SRC_DIR}/elf/elf.cpp
  ${ROCCLR_SRC_DIR}/os/alloc.cpp
@@ -589,8 +589,7 @@ Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& p
  : dev_(dev)
  , name_(name)
  , prog_(prog)
-  , signature_(nullptr)
-  , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
+  , signature_(nullptr) {
  // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
  // Due to std::string not being able to be memset to 0
  workGroupInfo_.size_ = 0;
@@ -26,7 +26,6 @@
 #include "platform/context.hpp"
 #include "platform/object.hpp"
 #include "platform/memory.hpp"
-#include "devwavelimiter.hpp"

 namespace amd {
 class Device;
@@ -435,16 +434,6 @@ class Kernel : public amd::HeapObject {

  size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }

-  //! Get profiling callback object
-  amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
-    return waveLimiter_.getProfilingCallback(vdev);
-  };
-
-  //! Get waves per shader array to be used for kernel execution.
-  uint getWavesPerSH(const device::VirtualDevice* vdev) const {
-    return waveLimiter_.getWavesPerSH(vdev);
-  };
-
  //! Returns GPU device object, associated with this kernel
  const amd::Device& device() const { return dev_; }

@@ -567,7 +556,6 @@ class Kernel : public amd::HeapObject {
  amd::KernelSignature* signature_; //!< kernel signature
  std::string buildLog_;            //!< build log
  std::vector<PrintfInfo> printf_;  //!< Format strings for GPU printf support
-  WaveLimiterManager waveLimiter_;  //!< adaptively control number of waves
  std::string runtimeHandle_;       //!< Runtime handle for context loader

  uint64_t kernelCodeHandle_ = 0;   //!< Kernel code handle (aka amd_kernel_code_t)
@@ -26,7 +26,6 @@
 #include "platform/context.hpp"
 #include "platform/object.hpp"
 #include "platform/memory.hpp"
-#include "devwavelimiter.hpp"

 #if defined(USE_COMGR_LIBRARY)
 #include "amd_comgr/amd_comgr.h"
@@ -1,348 +0,0 @@
-/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE. */
-
-#include "platform/command.hpp"
-#include "device/devkernel.hpp"
-#include "device/devwavelimiter.hpp"
-#include "os/os.hpp"
-#include "utils/flags.hpp"
-
-#include <cstdlib>
-using namespace std;
-
-namespace device {
-
-uint WaveLimiter::MaxWave;
-uint WaveLimiter::RunCount;
-uint WaveLimiter::AdaptCount;
-
-// ================================================================================================
-WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
-    : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
-  setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
-  MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
-  RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
-  AdaptCount = MaxContinuousSamples * 2 * (MaxWave + 1);
-
-  state_ = WARMUP;
-  if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
-    traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
-  }
-
-  waves_ = MaxWave;
-  enable_ = (SIMDPerSH_ == 0) ? false : enable;
-  bestWave_ = (enable_) ? MaxWave : 0;
-  worstWave_ = 0;
-  sampleCount_ = 0;
-  resultCount_ = 0;
-  numContinuousSamples_ = 0;
-}
-
-// ================================================================================================
-WaveLimiter::~WaveLimiter() {
-  if (traceStream_.is_open()) {
-    traceStream_.close();
-  }
-}
-
-// ================================================================================================
-uint WaveLimiter::getWavesPerSH() {
-  // Generate different wave counts in the adaptation mode
-  if ((state_ == ADAPT) && (sampleCount_ < AdaptCount)) {
-    if (numContinuousSamples_ == 0) {
-        ++waves_;
-        waves_ %= MaxWave + 1;
-        // Don't execute the wave count with the worst performance
-        if (waves_ != 0) {
-          while (worstWave_ >= waves_) {
-            ++waves_;
-            waves_ %= MaxWave + 1;
-          }
-        }
-    }
-    ++numContinuousSamples_;
-    numContinuousSamples_ %= MaxContinuousSamples;
-    ++sampleCount_;
-  }
-  else {
-    waves_ = bestWave_;
-  }
-  return waves_ * SIMDPerSH_;
-}
-
-// ================================================================================================
-WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
-                                     bool enableDump)
-    : WaveLimiter(manager, seqNum, enable, enableDump) {
-  dynRunCount_ = RunCount;
-  adpMeasure_.resize(MaxWave + 1);
-  adpSampleCnt_.resize(MaxWave + 1);
-  runMeasure_.resize(MaxWave + 1);
-  runSampleCnt_.resize(MaxWave + 1);
-
-  clearData();
-}
-
-// ================================================================================================
-WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
-
-// ================================================================================================
-void WLAlgorithmSmooth::clearData() {
-  waves_ = MaxWave;
-  countAll_ = 0;
-  clear(adpMeasure_);
-  clear(adpSampleCnt_);
-  dataCount_ = 0;
-}
-
-// ================================================================================================
-void WLAlgorithmSmooth::updateData(ulong time) {
-}
-
-// ================================================================================================
-void WLAlgorithmSmooth::outputTrace() {
-  if (!traceStream_.is_open()) {
-    return;
-  }
-
-  traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ <<
-    " waves=" << waves_ << " bestWave=" << bestWave_ << " worstWave=" << worstWave_ << '\n';
-  output(traceStream_, "\n adaptive measure = ", adpMeasure_);
-  output(traceStream_, "\n adaptive sample count = ", adpSampleCnt_);
-  output(traceStream_, "\n run measure = ", runMeasure_);
-  output(traceStream_, "\n run sample count = ", runSampleCnt_);
-  traceStream_ << "\n % time from the previous runs to the best wave: ";
-  float min = static_cast<float>(adpMeasure_[bestWave_]) / adpSampleCnt_[bestWave_];
-  for (uint i = 0; i < (MaxWave + 1); ++i) {
-    runSampleCnt_[i] = (runSampleCnt_[i] == 0) ? 1 : runSampleCnt_[i];
-    float average = static_cast<float>(runMeasure_[i]) / runSampleCnt_[i];
-    traceStream_ << (average * 100 / min) << " ";
-  }
-  traceStream_ << "\n run count = " << dynRunCount_;
-  traceStream_ << "\n\n";
-}
-
-// ================================================================================================
-void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
-  dumper_.addData(duration, waves, static_cast<char>(state_));
-
-  if (!enable_ || (duration == 0)) {
-    return;
-  }
-
-  countAll_++;
-
-  waves /= SIMDPerSH_;
-  // Collect the time for the current wave count
-  runMeasure_[waves] += duration;
-  runSampleCnt_[waves]++;
-
-  switch (state_) {
-    case ADAPT:
-      assert(duration > 0);
-      // Wave count 0 indicates the satrt of adaptation
-      if ((waves == 0) || (resultCount_ > 0)) {
-        // Scale time to us
-        adpMeasure_[waves] += duration;
-        adpSampleCnt_[waves]++;
-        resultCount_++;
-        // If the end of adaptation is reached, then analyze the results
-        if (resultCount_ == AdaptCount) {
-          // Reset the counters
-          resultCount_ = sampleCount_ = 0;
-          float min = std::numeric_limits<float>::max();
-          float max = std::numeric_limits<float>::min();
-          uint32_t best = bestWave_;
-          // Check performance for the previous run if it's available
-          if (runSampleCnt_[bestWave_] > 0) {
-            min = static_cast<float>(runMeasure_[bestWave_]) / runSampleCnt_[bestWave_];
-          }
-          else if (adpSampleCnt_[MaxWave] > 0) {
-            min = static_cast<float>(adpMeasure_[MaxWave]) / adpSampleCnt_[MaxWave];
-            bestWave_ = MaxWave;
-          }
-          // Find the fastest average time
-          float reference = min;
-          for (uint i = MaxWave; i > 0; --i) {
-            float average;
-            if (adpSampleCnt_[i] > 0) {
-              average = static_cast<float>(adpMeasure_[i]) / adpSampleCnt_[i];
-            }
-            else {
-              average = 0.0f;
-            }
-            // More waves have 5% advantage over the lower number
-            if (average * 1.05f < min) {
-              min = average;
-              bestWave_ = i;
-            }
-            if (average > max) {
-              max = average;
-              worstWave_ = i;
-            }
-          }
-          // Check for 5% acceptance
-          if ((min * 1.05f > reference) || (bestWave_ == best)) {
-            bestWave_ = best;
-            // Increase the run time if the same wave count is the best
-            dynRunCount_ += RunCount;
-            dynRunCount_++;
-          }
-          else {
-            dynRunCount_ = RunCount;
-          }
-          // Find the middle between the best and the worst
-          if (worstWave_ < bestWave_) {
-            worstWave_ += ((bestWave_ - worstWave_) >> 1);
-          } else {
-            worstWave_ = 0;
-          }
-          state_ = RUN;
-          outputTrace();
-          // Start to collect the new data for the best wave
-          countAll_ = 0;
-          runMeasure_[bestWave_] = 0;
-          runSampleCnt_[bestWave_] = 0;
-        }
-      }
-      return;
-    case WARMUP:
-    case RUN:
-      if (countAll_ < dynRunCount_) {
-        return;
-      }
-      if (state_ == WARMUP) {
-        runSampleCnt_[bestWave_] = 0;
-      }
-      state_ = ADAPT;
-      clearData();
-      return;
-  }
-}
-
-// ================================================================================================
-WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
-  enable_ = enable;
-  if (enable_) {
-    fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
-  }
-}
-
-// ================================================================================================
-WaveLimiter::DataDumper::~DataDumper() {
-  if (!enable_) {
-    return;
-  }
-
-  std::ofstream OFS(fileName_);
-  for (size_t i = 0, e = time_.size(); i != e; ++i) {
-    OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast<uint>(state_[i])
-        << '\n';
-  }
-  OFS.close();
-}
-
-// ================================================================================================
-void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
-  if (!enable_) {
-    return;
-  }
-
-  time_.push_back(time);
-  wavePerSIMD_.push_back(wave);
-  state_.push_back(state);
-}
-
-// ================================================================================================
-WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
-    : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
-  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, ((simdPerSH == 0) ? 1 : simdPerSH));
-  fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
-}
-
-// ================================================================================================
-WaveLimiterManager::~WaveLimiterManager() {
-  for (auto& I : limiters_) {
-    delete I.second;
-  }
-}
-
-// ================================================================================================
-const std::string& WaveLimiterManager::name() const { return owner_->name(); }
-
-// ================================================================================================
-uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
-  if (fixed_ > 0) {
-    return fixed_;
-  }
-  if (!enable_) {
-    return 0;
-  }
-  auto loc = limiters_.find(vdev);
-  if (loc == limiters_.end()) {
-    return 0;
-  }
-  assert(loc->second != nullptr);
-  return loc->second->getWavesPerSH();
-}
-
-amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
-    const device::VirtualDevice* vdev) {
-  assert(vdev != nullptr);
-  if (!enable_ && !enableDump_) {
-    return nullptr;
-  }
-
-  amd::ScopedLock SL(monitor_);
-  auto loc = limiters_.find(vdev);
-  if (loc != limiters_.end()) {
-    return loc->second;
-  }
-
-  auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
-  if (limiter == nullptr) {
-    enable_ = false;
-    return nullptr;
-  }
-  limiters_[vdev] = limiter;
-  return limiter;
-}
-
-// ================================================================================================
-void WaveLimiterManager::enable(bool isSupported) {
-  if (fixed_ > 0) {
-    enable_ = GPU_WAVE_LIMIT_ENABLE;
-    return;
-  }
-
-  // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
-  // Disabled for SI due to bug #10817
-  if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
-    enable_ = GPU_WAVE_LIMIT_ENABLE;
-  } else if (isSupported) {
-    if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
-      enable_ = true;
-    } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
-      fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
-    }
-  }
-}
-
-}  // namespace pal
@@ -1,173 +0,0 @@
-/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE. */
-
-#pragma once
-
-#include "thread/thread.hpp"
-#include <cstdio>
-#include <cstdlib>
-#include <cstdint>
-#include <fstream>
-#include <unordered_map>
-
-namespace amd {
-  struct ProfilingCallback : public amd::HeapObject {
-    virtual void callback(ulong duration, uint32_t waves) = 0;
-  };
-}
-
-//! \namespace pal PAL Device Implementation
-namespace device {
-
-class WaveLimiterManager;
-class Kernel;
-
-// Adaptively limit the number of waves per SIMD based on kernel execution time
-class WaveLimiter : public amd::ProfilingCallback {
- public:
-  explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
-  virtual ~WaveLimiter();
-
-  //! Get waves per shader array to be used for kernel execution.
-  uint getWavesPerSH();
-
- protected:
-  enum StateKind { WARMUP, ADAPT, RUN };
-
-  class DataDumper {
-   public:
-    explicit DataDumper(const std::string& kernelName, bool enable);
-    ~DataDumper();
-
-    //! Record execution time, waves/simd and state of wave limiter.
-    void addData(ulong time, uint wave, char state);
-
-    //! Whether this data dumper is enabled.
-    bool enabled() const { return enable_; }
-
-   private:
-    bool enable_;
-    std::string fileName_;
-    std::vector<ulong> time_;
-    std::vector<uint> wavePerSIMD_;
-    std::vector<char> state_;
-  };
-
-  bool enable_;
-  uint SIMDPerSH_;  // Number of SIMDs per SH
-  uint waves_;      // Waves per SIMD to be set
-  uint bestWave_;   // Optimal waves per SIMD
-  uint worstWave_;  // Wave number with the worst performance
-  uint countAll_;   // Number of kernel executions
-  StateKind state_;
-  WaveLimiterManager* manager_;
-  DataDumper dumper_;
-  std::ofstream traceStream_;
-  uint32_t sampleCount_;            //!< The number of samples for adaptive mode
-  uint32_t resultCount_;            //!< The number of results for adaptive mode
-  uint32_t numContinuousSamples_;   //!< The number of samples with the same wave count
-
-  static uint MaxWave;      // Maximum number of waves per SIMD
-  static uint RunCount;     // Number of kernel executions for normal run
-  static uint AdaptCount;   // Number of kernel executions for adapting
-  static constexpr uint MaxContinuousSamples = 2;
-
-  //! Call back from Event::recordProfilingInfo to get execution time.
-  virtual void callback(ulong duration, uint32_t waves) = 0;
-
-  //! Output trace of measurement/adaptation.
-  virtual void outputTrace() = 0;
-
-  template <class T> void clear(T& A) {
-    uint idx = 0;
-    for (auto& I : A) {
-      if (idx > worstWave_) {
-        I = 0;
-      }
-      ++idx;
-    }
-  }
-  template <class T> void output(std::ofstream& ofs, const std::string& prompt, T& A) {
-    ofs << prompt;
-    for (auto& I : A) {
-      ofs << ' ' << static_cast<ulong>(I);
-    }
-  }
-};
-
-class WLAlgorithmSmooth : public WaveLimiter {
- public:
-  explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
-                             bool enableDump);
-  virtual ~WLAlgorithmSmooth();
-
- private:
-  std::vector<uint64_t> adpMeasure_;    //!< Accumulated performance in the adaptation mode
-  std::vector<uint32_t> adpSampleCnt_;  //!< The number of samples in the adaptation mode
-  std::vector<uint64_t> runMeasure_;    //!< Accumulated performance in the run mode
-  std::vector<uint32_t> runSampleCnt_;  //!< The number of samples in the run mode
-  uint dynRunCount_;
-  uint dataCount_;
-
-  //! Update measurement data and optimal waves/simd with execution time.
-  void updateData(ulong time);
-
-  //! Clear measurement data for the next adaptation.
-  void clearData();
-
-  //! Call back from Event::recordProfilingInfo to get execution time.
-  void callback(ulong duration, uint32_t waves) override;
-
-  //! Output trace of measurement/adaptation.
-  void outputTrace() override;
-};
-
-// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
-class WaveLimiterManager {
- public:
-  explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH);
-  virtual ~WaveLimiterManager();
-
-  //! Get waves per shader array for a specific virtual device.
-  uint getWavesPerSH(const VirtualDevice*) const;
-
-  //! Provide call back function for a specific virtual device.
-  amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*);
-
-  //! Enable wave limiter manager by kernel metadata and flags.
-  void enable(bool isSupported = true);
-
-  //! Returns the kernel name
-  const std::string& name() const;
-
-  //! Get SimdPerSH.
-  uint getSimdPerSH() const { return simdPerSH_; }
-
- private:
-  device::Kernel* owner_;  // The kernel which owns this object
-  uint simdPerSH_;         // Simd Per SH
-  std::unordered_map<const VirtualDevice*, WaveLimiter*>
-    limiters_;            // Maps virtual device to wave limiter
-  bool enable_;           // Whether the adaptation is enabled
-  bool enableDump_;       // Whether the data dumper is enabled
-  uint fixed_;            // The fixed waves/simd value if not zero
-  amd::Monitor monitor_;  // The mutex for updating the wave limiter map
-};
-}
@@ -224,8 +224,6 @@ bool HSAILKernel::init() {
    return false;
  }

-  waveLimiter_.enable();
-
  size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
  error = amd::Hsail::QueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
                                openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
@@ -30,7 +30,6 @@
 #include "device/pal/palvirtual.hpp"
 #include "amd_hsa_kernel_code.h"
 #include "device/pal/palprintf.hpp"
-#include "device/devwavelimiter.hpp"
 #include "hsa.h"

 namespace amd {
@@ -2473,8 +2473,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {

    // Submit kernel to HW
    if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
-                                     &vcmd.event(), vcmd.sharedMemBytes(),
-                                     vcmd.cooperativeGroups())) {
+                                     vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

@@ -2489,7 +2488,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
    profilingBegin(vcmd);

    // Submit kernel to HW
-    if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
+    if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
                              vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
@@ -2499,9 +2498,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 }

 // ================================================================================================
-bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
-                                      const_address parameters, bool nativeMem,
-                                      amd::Event* enqueueEvent, uint32_t sharedMemBytes,
+bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
+                                      const amd::Kernel& kernel, const_address parameters,
+                                      bool nativeMem, uint32_t sharedMemBytes,
                                      bool cooperativeGroup) {
  size_t newOffset[3] = {0, 0, 0};
  size_t newGlobalSize[3] = {0, 0, 0};
@@ -2648,7 +2647,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    }
    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
-    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
+    dispatchParam.wavesPerSh = 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
 #ifdef PAL_DEBUGGER
@@ -311,7 +311,6 @@ class VirtualGPU : public device::VirtualDevice {
      const amd::Kernel& kernel,           //!< Kernel for execution
      const_address parameters,            //!< Parameters for the kernel
      bool nativeMem = true,               //!< Native memory objects
-      amd::Event* enqueueEvent = nullptr,  //!< Event provided in the enqueue kernel command
      uint32_t sharedMemBytes = 0,         //!< Shared memory size
      bool cooperativeGroups = false       //!< TRUE if cooperative groups mode is required
  );
@@ -101,10 +101,6 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
      break;
    default:
      profilingInfo_.end_ = timeStamp;
-      if (profilingInfo_.callback_ != nullptr) {
-        profilingInfo_.callback_->callback(timeStamp - profilingInfo_.start_,
-            profilingInfo_.waves_);
-      }
      break;
  }
  return timeStamp;
@@ -429,15 +425,12 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
    firstDevice_(firstDevice) {
  auto& device = queue.device();
  auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
-  profilingInfo_.setCallback(devKernel->getProfilingCallback(
-    queue.vdev()), devKernel->getWavesPerSH(queue.vdev()));
  if (cooperativeGroups()) {
    setNumWorkgroups();
  }
  if (forceProfiling) {
    profilingInfo_.enabled_ = true;
    profilingInfo_.clear();
-    profilingInfo_.callback_ = nullptr;
    profilingInfo_.marker_ts_ = true;
  }
  kernel_.retain();
@@ -104,10 +104,9 @@ class Event : public RuntimeObject {
  static const EventWaitList nullWaitList;

  struct ProfilingInfo {
-    ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
+    ProfilingInfo(bool enabled = false) : enabled_(enabled), marker_ts_(false) {
      if (enabled) {
        clear();
-        callback_ = nullptr;
        correlation_id_ = activity_prof::correlation_id;
      }
    }
@@ -116,11 +115,9 @@ class Event : public RuntimeObject {
    uint64_t submitted_;
    uint64_t start_;
    uint64_t end_;
-    bool enabled_;        //!< Profiling enabled for the wave limiter
-    uint32_t waves_;      //!< The number of waves used in a dispatch
-    ProfilingCallback* callback_;
    uint64_t correlation_id_;
-    bool marker_ts_;      //!< TS marker
+    bool enabled_;    //!< Profiling is enabled for this event
+    bool marker_ts_;  //!< TS marker

    void clear() {
      queued_ = 0ULL;
@@ -128,15 +125,6 @@ class Event : public RuntimeObject {
      start_ = 0ULL;
      end_ = 0ULL;
    }
-    void setCallback(ProfilingCallback* callback, uint32_t waves) {
-      if (callback == NULL) {
-        return;
-      }
-      enabled_ = true;
-      waves_ = waves;
-      clear();
-      callback_ = callback;
-    }
  } profilingInfo_;

  //! Construct a new event.
@@ -163,7 +151,6 @@ class Event : public RuntimeObject {
  void EnableProfiling() {
    profilingInfo_.enabled_ = true;
    profilingInfo_.clear();
-    profilingInfo_.callback_ = nullptr;
    profilingInfo_.correlation_id_ = activity_prof::correlation_id;
  }

@@ -118,8 +118,6 @@ release(uint, OCL_SET_SVM_SIZE, 4*16384,                                      \
        "set SVM space size for discrete GPU")                                \
 release(uint, GPU_WAVES_PER_SIMD, 0,                                          \
        "Force the number of waves per SIMD (1-10)")                          \
-release(bool, GPU_WAVE_LIMIT_ENABLE, false,                                   \
-        "1 = Enable adaptive wave limiter")                                   \
 release(bool, OCL_STUB_PROGRAMS, false,                                       \
        "1 = Enables OCL programs stubing")                                   \
 release(bool, GPU_ANALYZE_HANG, false,                                        \
@@ -128,16 +126,6 @@ release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2,                                     \
        "Maximum size (in Ki) that allows device memory substitution with system") \
 release(bool, GPU_ADD_HBCC_SIZE, false,                                        \
        "Add HBCC size to the reported device memory")                        \
-release_on_stg(uint, GPU_WAVE_LIMIT_CU_PER_SH, 0,                             \
-        "Assume the number of CU per SH for wave limiter")                    \
-release_on_stg(uint, GPU_WAVE_LIMIT_MAX_WAVE, 10,                             \
-        "Set maximum waves per SIMD to try for wave limiter")                 \
-release_on_stg(uint, GPU_WAVE_LIMIT_RUN, 20,                                  \
-        "Set running factor for wave limiter")                                \
-release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "",                              \
-        "File path prefix for dumping wave limiter output")                   \
-release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "",                             \
-        "File path prefix for tracing wave limiter")                          \
 release(bool, PAL_DISABLE_SDMA, false,                                        \
        "1 = Disable SDMA for PAL")                                           \
 release(uint, PAL_RGP_DISP_COUNT, 10000,                                      \