From 6cc75de90f551df25b5752b1e0d75006583df026 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Wed, 29 Aug 2018 18:54:19 -0400
Subject: [PATCH] P4 to Git Change 1599699 by gandryey@gera-w8 on 2018/08/29
 18:43:02

	SWDEV-79445 - OCL generic changes and code clean-up
	- Move WaveLimiter logic to the abstract layer. PAL version was taken as the base, thus performance of GSL path can be affected by this change

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#315 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.cpp#1 move/add
... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.hpp#1 move/add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#598 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#331 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#133 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#15 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#11 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#8 move/delete
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#8 move/delete
---
 rocclr/runtime/device/device.hpp              |   1 +
 rocclr/runtime/device/devkernel.cpp           |  38 ++-
 rocclr/runtime/device/devkernel.hpp           |  53 +---
 .../palwavelimiter.cpp => devwavelimiter.cpp} |  34 ++-
 .../palwavelimiter.hpp => devwavelimiter.hpp} |  23 +-
 rocclr/runtime/device/gpu/gpudevice.cpp       |   1 +
 rocclr/runtime/device/gpu/gpukernel.cpp       |   8 +-
 rocclr/runtime/device/gpu/gpukernel.hpp       |  21 +-
 rocclr/runtime/device/gpu/gpuwavelimiter.cpp  | 276 ------------------
 rocclr/runtime/device/gpu/gpuwavelimiter.hpp  | 151 ----------
 rocclr/runtime/device/pal/paldevice.cpp       |   1 +
 rocclr/runtime/device/pal/palkernel.cpp       |   7 +-
 rocclr/runtime/device/pal/palkernel.hpp       |  14 +-
 13 files changed, 95 insertions(+), 533 deletions(-)
 rename rocclr/runtime/device/{pal/palwavelimiter.cpp => devwavelimiter.cpp} (80%)
 rename rocclr/runtime/device/{pal/palwavelimiter.hpp => devwavelimiter.hpp} (90%)
 delete mode 100644 rocclr/runtime/device/gpu/gpuwavelimiter.cpp
 delete mode 100644 rocclr/runtime/device/gpu/gpuwavelimiter.hpp

diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index ec89e63f5b..13b7ac451b 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -425,6 +425,7 @@ struct Info : public amd::EmbeddedObject {
   //! that execute in parallel. All work items from the same work group must be
   //! executed by SIMDs in the same compute unit.
   cl_uint simdPerCU_;
+  cl_uint cuPerShaderArray_;  //!< Number of CUs per shader array
   //! The maximum number of work items from the same work group that can be
   //! executed by a SIMD in parallel
   cl_uint simdWidth_;
diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index 9c4b43f960..12ec537d31 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -23,7 +23,43 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
 
 namespace device {
 
- // ================================================================================================
+// ================================================================================================
+Kernel::Kernel(const amd::Device& dev, const std::string& name)
+  : dev_(dev)
+  , name_(name)
+  , signature_(nullptr)
+  , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
+  // waveLimiter_ is sized in SIMDs per shader array (CUs per shader array * SIMDs per CU).
+  // Fields are zeroed one by one: workGroupInfo_ holds a std::string, so memset is not usable.
+  workGroupInfo_.size_ = 0;
+  workGroupInfo_.compileSize_[0] = 0;
+  workGroupInfo_.compileSize_[1] = 0;
+  workGroupInfo_.compileSize_[2] = 0;
+  workGroupInfo_.localMemSize_ = 0;
+  workGroupInfo_.preferredSizeMultiple_ = 0;
+  workGroupInfo_.privateMemSize_ = 0;
+  workGroupInfo_.scratchRegs_ = 0;
+  workGroupInfo_.wavefrontPerSIMD_ = 0;
+  workGroupInfo_.wavefrontSize_ = 0;
+  workGroupInfo_.availableGPRs_ = 0;
+  workGroupInfo_.usedGPRs_ = 0;
+  workGroupInfo_.availableSGPRs_ = 0;
+  workGroupInfo_.usedSGPRs_ = 0;
+  workGroupInfo_.availableVGPRs_ = 0;
+  workGroupInfo_.usedVGPRs_ = 0;
+  workGroupInfo_.availableLDSSize_ = 0;
+  workGroupInfo_.usedLDSSize_ = 0;
+  workGroupInfo_.availableStackSize_ = 0;
+  workGroupInfo_.usedStackSize_ = 0;
+  workGroupInfo_.compileSizeHint_[0] = 0;
+  workGroupInfo_.compileSizeHint_[1] = 0;
+  workGroupInfo_.compileSizeHint_[2] = 0;
+  workGroupInfo_.compileVecTypeHint_ = "";
+  workGroupInfo_.uniformWorkGroupSize_ = false;
+  workGroupInfo_.wavesPerSimdHint_ = 0;
+}
+
+// ================================================================================================
 bool Kernel::createSignature(
   const parameters_t& params, uint32_t numParameters,
   uint32_t version) {
diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp
index 94200a92b0..59f7733fa9 100644
--- a/rocclr/runtime/device/devkernel.hpp
+++ b/rocclr/runtime/device/devkernel.hpp
@@ -7,6 +7,7 @@
 #include "platform/context.hpp"
 #include "platform/object.hpp"
 #include "platform/memory.hpp"
+#include "devwavelimiter.hpp"
 
 #if defined(WITH_LIGHTNING_COMPILER)
 namespace llvm {
@@ -37,10 +38,6 @@ class Device;
 class KernelSignature;
 class NDRange;
 
-struct ProfilingCallback : public amd::HeapObject {
-  virtual void callback(ulong duration, uint32_t waves) = 0;
-};
-
 struct KernelParameterDescriptor {
   enum {
     Value = 0,
@@ -124,39 +121,7 @@ class Kernel : public amd::HeapObject {
   };
 
   //! Default constructor
-  Kernel(const amd::Device& dev, const std::string& name)
-    : dev_(dev)
-    , name_(name)
-    , signature_(nullptr) {
-    // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
-    // Due to std::string not being able to be memset to 0
-    workGroupInfo_.size_ = 0;
-    workGroupInfo_.compileSize_[0] = 0;
-    workGroupInfo_.compileSize_[1] = 0;
-    workGroupInfo_.compileSize_[2] = 0;
-    workGroupInfo_.localMemSize_ = 0;
-    workGroupInfo_.preferredSizeMultiple_ = 0;
-    workGroupInfo_.privateMemSize_ = 0;
-    workGroupInfo_.scratchRegs_ = 0;
-    workGroupInfo_.wavefrontPerSIMD_ = 0;
-    workGroupInfo_.wavefrontSize_ = 0;
-    workGroupInfo_.availableGPRs_ = 0;
-    workGroupInfo_.usedGPRs_ = 0;
-    workGroupInfo_.availableSGPRs_ = 0;
-    workGroupInfo_.usedSGPRs_ = 0;
-    workGroupInfo_.availableVGPRs_ = 0;
-    workGroupInfo_.usedVGPRs_ = 0;
-    workGroupInfo_.availableLDSSize_ = 0;
-    workGroupInfo_.usedLDSSize_ = 0;
-    workGroupInfo_.availableStackSize_ = 0;
-    workGroupInfo_.usedStackSize_ = 0;
-    workGroupInfo_.compileSizeHint_[0] = 0;
-    workGroupInfo_.compileSizeHint_[1] = 0;
-    workGroupInfo_.compileSizeHint_[2] = 0;
-    workGroupInfo_.compileVecTypeHint_ = "";
-    workGroupInfo_.uniformWorkGroupSize_ = false;
-    workGroupInfo_.wavesPerSimdHint_ = 0;
-  }
+  Kernel(const amd::Device& dev, const std::string& name);
 
   //! Default destructor
   virtual ~Kernel();
@@ -196,13 +161,14 @@ class Kernel : public amd::HeapObject {
   size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
 
   //! Get profiling callback object
-  virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
-    return nullptr;
-  }
+  amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
+    return waveLimiter_.getProfilingCallback(vdev);
+  }
 
-  virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
-      return 0;
-  }
+  //! Get waves per shader array to be used for kernel execution (forwarded to the wave limiter).
+  uint getWavesPerSH(const device::VirtualDevice* vdev) const {
+    return waveLimiter_.getWavesPerSH(vdev);
+  }
 
   //! Returns GPU device object, associated with this kernel
   const amd::Device& dev() const { return dev_; }
@@ -272,6 +238,7 @@ class Kernel : public amd::HeapObject {
   amd::KernelSignature* signature_; //!< kernel signature
   std::string buildLog_;            //!< build log
   std::vector<PrintfInfo> printf_;  //!< Format strings for GPU printf support
+  WaveLimiterManager waveLimiter_;  //!< adaptively control number of waves
 
   union Flags {
     struct {
diff --git a/rocclr/runtime/device/pal/palwavelimiter.cpp b/rocclr/runtime/device/devwavelimiter.cpp
similarity index 80%
rename from rocclr/runtime/device/pal/palwavelimiter.cpp
rename to rocclr/runtime/device/devwavelimiter.cpp
index 75cb0811cc..67d2380ffb 100644
--- a/rocclr/runtime/device/pal/palwavelimiter.cpp
+++ b/rocclr/runtime/device/devwavelimiter.cpp
@@ -1,20 +1,22 @@
 //
 // Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
 //
-#include "device/pal/palkernel.hpp"
-#include "device/pal/palwavelimiter.hpp"
+#include "platform/command.hpp"
+#include "device/devkernel.hpp"
+#include "device/devwavelimiter.hpp"
 #include "os/os.hpp"
 #include "utils/flags.hpp"
 
 #include <cstdlib>
 using namespace std;
 
-namespace pal {
+namespace device {
 
 uint WaveLimiter::MaxWave;
 uint WaveLimiter::RunCount;
 uint WaveLimiter::AdaptCount;
 
+// ================================================================================================
 WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
     : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
   setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
@@ -36,12 +38,14 @@ WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable,
   numContinuousSamples_ = 0;
 }
 
+// ================================================================================================
 WaveLimiter::~WaveLimiter() {
   if (traceStream_.is_open()) {
     traceStream_.close();
   }
 }
 
+// ================================================================================================
 uint WaveLimiter::getWavesPerSH() {
   // Generate different wave counts in the adaptation mode
   if ((state_ == ADAPT) && (sampleCount_ < AdaptCount)) {
@@ -66,6 +70,7 @@ uint WaveLimiter::getWavesPerSH() {
   return waves_ * SIMDPerSH_;
 }
 
+// ================================================================================================
 WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
                                      bool enableDump)
     : WaveLimiter(manager, seqNum, enable, enableDump) {
@@ -78,8 +83,10 @@ WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, b
   clearData();
 }
 
+// ================================================================================================
 WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
 
+// ================================================================================================
 void WLAlgorithmSmooth::clearData() {
   waves_ = MaxWave;
   countAll_ = 0;
@@ -88,10 +95,11 @@ void WLAlgorithmSmooth::clearData() {
   dataCount_ = 0;
 }
 
+// ================================================================================================
 void WLAlgorithmSmooth::updateData(ulong time) {
-
 }
 
+// ================================================================================================
 void WLAlgorithmSmooth::outputTrace() {
   if (!traceStream_.is_open()) {
     return;
@@ -114,7 +122,7 @@ void WLAlgorithmSmooth::outputTrace() {
   traceStream_ << "\n\n";
 }
 
-
+// ================================================================================================
 void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
   dumper_.addData(duration, waves, static_cast<char>(state_));
 
@@ -212,6 +220,7 @@ void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
   }
 }
 
+// ================================================================================================
 WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
   enable_ = enable;
   if (enable_) {
@@ -219,6 +228,7 @@ WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable)
   }
 }
 
+// ================================================================================================
 WaveLimiter::DataDumper::~DataDumper() {
   if (!enable_) {
     return;
@@ -232,6 +242,7 @@ WaveLimiter::DataDumper::~DataDumper() {
   OFS.close();
 }
 
+// ================================================================================================
 void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
   if (!enable_) {
     return;
@@ -242,18 +253,24 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
   state_.push_back(state);
 }
 
+// ================================================================================================
 WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
     : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
-  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
+  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, ((simdPerSH == 0) ? 1 : simdPerSH));
   fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
 }
 
+// ================================================================================================
 WaveLimiterManager::~WaveLimiterManager() {
   for (auto& I : limiters_) {
     delete I.second;
   }
 }
 
+// ================================================================================================
+const std::string& WaveLimiterManager::name() const { return owner_->name(); }
+
+// ================================================================================================
 uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
   if (fixed_ > 0) {
     return fixed_;
@@ -291,7 +308,8 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
   return limiter;
 }
 
-void WaveLimiterManager::enable() {
+// ================================================================================================
+void WaveLimiterManager::enable(bool isSupported) {
   if (fixed_ > 0) {
     return;
   }
@@ -300,7 +318,7 @@ void WaveLimiterManager::enable() {
   // Disabled for SI due to bug #10817
   if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
     enable_ = GPU_WAVE_LIMIT_ENABLE;
-  } else {
+  } else if (isSupported) {
     if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
       enable_ = true;
     } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
diff --git a/rocclr/runtime/device/pal/palwavelimiter.hpp b/rocclr/runtime/device/devwavelimiter.hpp
similarity index 90%
rename from rocclr/runtime/device/pal/palwavelimiter.hpp
rename to rocclr/runtime/device/devwavelimiter.hpp
index 6caea9eb79..e13aaee8c0 100644
--- a/rocclr/runtime/device/pal/palwavelimiter.hpp
+++ b/rocclr/runtime/device/devwavelimiter.hpp
@@ -3,7 +3,6 @@
 //
 #pragma once
 
-#include "platform/command.hpp"
 #include "thread/thread.hpp"
 #include <cstdio>
 #include <cstdlib>
@@ -11,11 +10,17 @@
 #include <fstream>
 #include <unordered_map>
 
+namespace amd {
+  struct ProfilingCallback : public amd::HeapObject {
+    virtual void callback(ulong duration, uint32_t waves) = 0;
+  };
+}
+
-//! \namespace pal PAL Device Implementation
+//! \namespace device Abstract Device Implementation
-namespace pal {
+namespace device {
 
 class WaveLimiterManager;
-class HSAILKernel;
+class Kernel;
 
 // Adaptively limit the number of waves per SIMD based on kernel execution time
 class WaveLimiter : public amd::ProfilingCallback {
@@ -120,20 +125,20 @@ class WLAlgorithmSmooth : public WaveLimiter {
 // Create wave limiter for each virtual device for a kernel and manages the wave limiters.
 class WaveLimiterManager {
  public:
-  explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
+  explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH);
   virtual ~WaveLimiterManager();
 
   //! Get waves per shader array for a specific virtual device.
-  uint getWavesPerSH(const device::VirtualDevice*) const;
+  uint getWavesPerSH(const VirtualDevice*) const;
 
   //! Provide call back function for a specific virtual device.
-  amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*);
+  amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*);
 
   //! Enable wave limiter manager by kernel metadata and flags.
-  void enable();
+  void enable(bool isSupported = true);
 
   //! Returns the kernel name
-  const std::string& name() const { return owner_->name(); }
+  const std::string& name() const;
 
   //! Get SimdPerSH.
   uint getSimdPerSH() const { return simdPerSH_; }
@@ -141,7 +146,7 @@ class WaveLimiterManager {
  private:
   device::Kernel* owner_;  // The kernel which owns this object
   uint simdPerSH_;         // Simd Per SH
-  std::unordered_map<const device::VirtualDevice*, WaveLimiter*>
+  std::unordered_map<const VirtualDevice*, WaveLimiter*>
     limiters_;            // Maps virtual device to wave limiter
   bool enable_;           // Whether the adaptation is enabled
   bool enableDump_;       // Whether the data dumper is enabled
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 2d95f4eae7..de937f6e5a 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -565,6 +565,7 @@ void NullDevice::fillDeviceInfo(const CALdeviceattribs& calAttr, const gslMemInf
     info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation & 0x07);
 
     info_.simdPerCU_ = hwInfo()->simdPerCU_;
+    info_.cuPerShaderArray_ = calAttr.numberOfCUsperShaderArray;
     info_.simdWidth_ = hwInfo()->simdWidth_;
     info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
     info_.wavefrontWidth_ = calAttr.wavefrontSize;
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index fe88dbd499..68d7a777eb 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -810,9 +810,7 @@ bool Kernel::create(const std::string& code, const std::string& metadata, const
 
 Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog,
                const InitData* initData)
-    : NullKernel(name, gpuDev, prog),
-      waveLimiter_(this,
-                   dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) {
+    : NullKernel(name, gpuDev, prog) {
   hwPrivateSize_ = 0;
   if (NULL != initData) {
     flags_ = initData->flags_;
@@ -3054,9 +3052,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
       code_(NULL),
       codeSize_(0),
       hwMetaData_(NULL),
-      extraArgumentsNum_(extraArgsNum),
-      waveLimiter_(this, (prog->isNull() ? 1 : dev().getAttribs().numberOfCUsperShaderArray) *
-                       dev().hwInfo()->simdPerCU_) {
+      extraArgumentsNum_(extraArgsNum) {
   flags_.hsa_ = true;
 }
 
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 544cc9e9e7..a60ada3dad 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -15,7 +15,7 @@
 #include "device/gpu/gpuvirtual.hpp"
 #include "amd_hsa_kernel_code.h"
 #include "device/gpu/gpuprintf.hpp"
-#include "device/gpu/gpuwavelimiter.hpp"
+#include "device/devwavelimiter.hpp"
 #include "hsa.h"
 
 namespace amd {
@@ -608,11 +608,6 @@ class Kernel : public NullKernel {
                       VirtualGPU::GslKernelDesc* desc  //!< Kernel descriptor
                       ) const;
 
-  //! Get profiling callback object
-  virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
-    return waveLimiter_.getProfilingCallback(vdev);
-  }
-
  protected:
   //! Initializes the kernel parameters for the abstraction layer
   bool initParameters();
@@ -707,8 +702,6 @@ class Kernel : public NullKernel {
 
   uint hwPrivateSize_;  //!< initial HW private size
   uint hwLocalSize_;    //!< initial HW local size
-
-  WaveLimiterManager waveLimiter_;  //!< adaptively control number of waves
 };
 
 enum HSAIL_ADDRESS_QUALIFIER {
@@ -833,16 +826,6 @@ class HSAILKernel : public device::Kernel {
   //! Returns kernel's extra argument count
   uint extraArgumentsNum() const { return extraArgumentsNum_; }
 
-  //! Get profiling callback object
-  virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
-    return waveLimiter_.getProfilingCallback(vdev);
-  }
-
-  //! Get waves per shader array to be used for kernel execution.
-  uint getWavesPerSH(const device::VirtualDevice* vdev) const {
-    return waveLimiter_.getWavesPerSH(vdev);
-  }
-
  private:
   //! Disable copy constructor
   HSAILKernel(const HSAILKernel&);
@@ -873,8 +856,6 @@ class HSAILKernel : public device::Kernel {
   char* hwMetaData_;  //!< SI metadata
 
   uint extraArgumentsNum_;  //! Number of extra (hidden) kernel arguments
-
-  WaveLimiterManager waveLimiter_;  //!< adaptively control number of waves
 };
 
 /*@}*/} // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
deleted file mode 100644
index b110f41633..0000000000
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#include "device/gpu/gpukernel.hpp"
-#include "device/gpu/gpuwavelimiter.hpp"
-#include "os/os.hpp"
-#include "utils/flags.hpp"
-
-#include <cstdlib>
-using namespace std;
-
-namespace gpu {
-
-uint WaveLimiter::MaxWave;
-uint WaveLimiter::WarmUpCount;
-uint WaveLimiter::RunCount;
-uint WLAlgorithmSmooth::AdaptCount;
-uint WLAlgorithmSmooth::AbandonThresh;
-uint WLAlgorithmSmooth::DscThresh;
-
-WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
-    : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
-  setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
-  MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
-  WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
-  RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
-
-  state_ = WARMUP;
-  if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
-    traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
-  }
-
-  waves_ = MaxWave;
-  currWaves_ = MaxWave;
-  bestWave_ = MaxWave;
-  enable_ = enable;
-}
-
-WaveLimiter::~WaveLimiter() {
-  if (traceStream_.is_open()) {
-    traceStream_.close();
-  }
-}
-
-uint WaveLimiter::getWavesPerSH() {
-  currWaves_ = waves_;
-  return waves_ * SIMDPerSH_;
-}
-
-WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
-                                     bool enableDump)
-    : WaveLimiter(manager, seqNum, enable, enableDump) {
-  AdaptCount = 2 * MaxWave + 1;
-  AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
-  DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
-
-  dynRunCount_ = RunCount;
-  measure_.resize(MaxWave + 1);
-  reference_.resize(MaxWave + 1);
-  trial_.resize(MaxWave + 1);
-  ratio_.resize(MaxWave + 1);
-
-  clearData();
-}
-
-WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
-
-void WLAlgorithmSmooth::clearData() {
-  waves_ = MaxWave;
-  countAll_ = 0;
-  clear(measure_);
-  clear(reference_);
-  clear(trial_);
-  clear(ratio_);
-  discontinuous_ = false;
-  dataCount_ = 0;
-}
-
-void WLAlgorithmSmooth::updateData(ulong time) {
-  auto count = dataCount_ - 1;
-  assert(count < 2 * MaxWave + 1);
-  assert(time > 0);
-  assert(currWaves_ == waves_);
-  if (count % 2 == 0) {
-    assert(waves_ == MaxWave);
-    auto pos = count / 2;
-    measure_[pos] = time;
-    if (pos > 0) {
-      auto wave = MaxWave + 1 - pos;
-      if (abs(static_cast<long>(measure_[pos - 1]) - static_cast<long>(measure_[pos])) * 100 /
-              measure_[pos] >
-          DscThresh) {
-        discontinuous_ = true;
-      }
-      reference_[wave] = (time + measure_[pos - 1]) / 2;
-      ratio_[wave] = trial_[wave] * 100 / reference_[wave];
-      if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
-        bestWave_ = wave;
-      }
-    }
-  } else {
-    assert(waves_ == MaxWave - count / 2);
-    trial_[waves_] = time;
-  }
-  outputTrace();
-}
-
-void WLAlgorithmSmooth::outputTrace() {
-  if (!traceStream_.is_open()) {
-    return;
-  }
-
-  traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
-               << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_
-               << '\n';
-  output(traceStream_, "\n measure = ", measure_);
-  output(traceStream_, "\n reference = ", reference_);
-  output(traceStream_, "\n ratio = ", ratio_);
-  traceStream_ << "\n\n";
-}
-
-
-void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
-  dumper_.addData(duration, currWaves_, static_cast<char>(state_));
-
-  if (!enable_ || (duration == 0)) {
-    return;
-  }
-
-  countAll_++;
-
-  switch (state_) {
-    case WARMUP:
-      if (countAll_ < WarmUpCount) {
-        return;
-      }
-      state_ = ADAPT;
-      bestWave_ = MaxWave;
-      clearData();
-      return;
-    case ADAPT:
-      assert(duration > 0);
-      if (waves_ == currWaves_) {
-        dataCount_++;
-        updateData(duration);
-        waves_ = MaxWave + 1 - dataCount_ / 2;
-        if (dataCount_ == 1 || (dataCount_ < AdaptCount && !discontinuous_ &&
-                                (dataCount_ % 2 == 0 || ratio_[waves_] < AbandonThresh))) {
-          if (dataCount_ % 2 == 1) {
-            --waves_;
-          } else {
-            waves_ = MaxWave;
-          }
-          return;
-        }
-        waves_ = bestWave_;
-        if (dataCount_ >= AdaptCount) {
-          dynRunCount_ = RunCount;
-        } else {
-          dynRunCount_ = AdaptCount;
-        }
-        countAll_ = rand() % MaxWave;
-        state_ = RUN;
-      }
-      return;
-    case RUN:
-      if (countAll_ < dynRunCount_) {
-        return;
-      }
-      state_ = ADAPT;
-      bestWave_ = MaxWave;
-      clearData();
-      return;
-  }
-}
-
-WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
-  enable_ = enable;
-  if (enable_) {
-    fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
-  }
-}
-
-WaveLimiter::DataDumper::~DataDumper() {
-  if (!enable_) {
-    return;
-  }
-
-  std::ofstream OFS(fileName_);
-  for (size_t i = 0, e = time_.size(); i != e; ++i) {
-    OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast<uint>(state_[i])
-        << '\n';
-  }
-  OFS.close();
-}
-
-void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
-  if (!enable_) {
-    return;
-  }
-
-  time_.push_back(time);
-  wavePerSIMD_.push_back(wave);
-  state_.push_back(state);
-}
-
-WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
-    : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
-  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
-  fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
-}
-
-WaveLimiterManager::~WaveLimiterManager() {
-  for (auto& I : limiters_) {
-    delete I.second;
-  }
-}
-
-uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
-  if (fixed_ > 0) {
-    return fixed_;
-  }
-  if (!enable_) {
-    return 0;
-  }
-  auto loc = limiters_.find(vdev);
-  if (loc == limiters_.end()) {
-    return 0;
-  }
-  assert(loc->second != NULL);
-  return loc->second->getWavesPerSH();
-}
-
-amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
-    const device::VirtualDevice* vdev) {
-  assert(vdev != NULL);
-  if (!enable_ && !enableDump_) {
-    return NULL;
-  }
-
-  amd::ScopedLock SL(monitor_);
-  auto loc = limiters_.find(vdev);
-  if (loc != limiters_.end()) {
-    return loc->second;
-  }
-
-  auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
-  if (limiter == NULL) {
-    enable_ = false;
-    return NULL;
-  }
-  limiters_[vdev] = limiter;
-  return limiter;
-}
-
-void WaveLimiterManager::enable(const bool isCiPlus) {
-  if (fixed_ > 0) {
-    return;
-  }
-
-  // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
-  // Disabled for SI due to bug #10817
-  if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
-    enable_ = GPU_WAVE_LIMIT_ENABLE;
-  } else {
-    if (isCiPlus) {
-      if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
-        enable_ = true;
-      } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
-        fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
-      }
-    }
-  }
-}
-}
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
deleted file mode 100644
index 570a457d62..0000000000
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#ifndef GPUWAVELIMITER_HPP_
-#define GPUWAVELIMITER_HPP_
-
-#include "platform/command.hpp"
-#include "thread/thread.hpp"
-#include <cstdio>
-#include <cstdlib>
-#include <cstdint>
-#include <fstream>
-#include <unordered_map>
-
-//! \namespace gpu GPU Device Implementation
-namespace gpu {
-
-class WaveLimiterManager;
-
-// Adaptively limit the number of waves per SIMD based on kernel execution time
-class WaveLimiter : public amd::ProfilingCallback {
- public:
-  explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
-  virtual ~WaveLimiter();
-
-  //! Get waves per shader array to be used for kernel execution.
-  uint getWavesPerSH();
-
- protected:
-  enum StateKind { WARMUP, ADAPT, RUN };
-
-  class DataDumper {
-   public:
-    explicit DataDumper(const std::string& kernelName, bool enable);
-    ~DataDumper();
-
-    //! Record execution time, waves/simd and state of wave limiter.
-    void addData(ulong time, uint wave, char state);
-
-    //! Whether this data dumper is enabled.
-    bool enabled() const { return enable_; }
-
-   private:
-    bool enable_;
-    std::string fileName_;
-    std::vector<ulong> time_;
-    std::vector<uint> wavePerSIMD_;
-    std::vector<char> state_;
-  };
-
-  std::vector<uint64_t> measure_;
-  bool enable_;
-  uint SIMDPerSH_;  // Number of SIMDs per SH
-  uint waves_;      // Waves per SIMD to be set
-  uint bestWave_;   // Optimal waves per SIMD
-  uint countAll_;   // Number of kernel executions
-  StateKind state_;
-  WaveLimiterManager* manager_;
-  DataDumper dumper_;
-  std::ofstream traceStream_;
-  uint currWaves_;  // Current waves per SIMD
-
-  static uint MaxWave;      // Maximum number of waves per SIMD
-  static uint WarmUpCount;  // Number of kernel executions for warm up
-  static uint RunCount;     // Number of kernel executions for normal run
-
-  //! Call back from Event::recordProfilingInfo to get execution time.
-  virtual void callback(ulong duration, uint32_t waves) = 0;
-
-  //! Output trace of measurement/adaptation.
-  virtual void outputTrace() = 0;
-
-  template <class T> void clear(T& A) {
-    for (auto& I : A) {
-      I = 0;
-    }
-  }
-  template <class T> void output(std::ofstream& ofs, const std::string& prompt, T& A) {
-    ofs << prompt;
-    for (auto& I : A) {
-      ofs << ' ' << static_cast<ulong>(I);
-    }
-  }
-};
-
-class WLAlgorithmSmooth : public WaveLimiter {
- public:
-  explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
-                             bool enableDump);
-  virtual ~WLAlgorithmSmooth();
-
- private:
-  std::vector<uint64_t> reference_;
-  std::vector<uint64_t> trial_;
-  std::vector<uint64_t> ratio_;
-  bool discontinuous_;  // Measured data is discontinuous
-  uint dynRunCount_;
-  uint dataCount_;
-
-  static uint AdaptCount;     // Number of kernel executions for adapting
-  static uint AbandonThresh;  // Threshold to abandon adaptation
-  static uint DscThresh;      // Threshold for identifying discontinuities
-
-  //! Update measurement data and optimal waves/simd with execution time.
-  void updateData(ulong time);
-
-  //! Clear measurement data for the next adaptation.
-  void clearData();
-
-  //! Call back from Event::recordProfilingInfo to get execution time.
-  void callback(ulong duration, uint32_t waves);
-
-  //! Output trace of measurement/adaptation.
-  void outputTrace();
-};
-
-// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
-class WaveLimiterManager {
- public:
-  explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
-  virtual ~WaveLimiterManager();
-
-  //! Get waves per shader array for a specific virtual device.
-  uint getWavesPerSH(const device::VirtualDevice*) const;
-
-  //! Provide call back function for a specific virtual device.
-  amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*);
-
-  //! Enable wave limiter manager by kernel metadata and flags.
-  void enable(const bool isCiPlus);
-
-  //! Returns the kernel name
-  const std::string& name() const { return owner_->name(); }
-
-  //! Get SimdPerSH.
-  uint getSimdPerSH() const { return simdPerSH_; }
-
- private:
-  device::Kernel* owner_;  // The kernel which owns this object
-  uint simdPerSH_;         // Simd Per SH
-  std::unordered_map<const device::VirtualDevice*,
-                     WaveLimiter*>
-      limiters_;          // Maps virtual device to wave limiter
-  bool enable_;           // Whether the adaptation is enabled
-  bool enableDump_;       // Whether the data dumper is enabled
-  uint fixed_;            // The fixed waves/simd value if not zero
-  amd::Monitor monitor_;  // The mutex for updating the wave limiter map
-};
-}
-#endif
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 5d0a0bcd03..aaa4987605 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -574,6 +574,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
     info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber;
 
     info_.simdPerCU_ = hwInfo()->simdPerCU_;
+    info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
     info_.simdWidth_ = hwInfo()->simdWidth_;
     info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
     info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize;
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 7c330fbbd4..fbfe429231 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -74,12 +74,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
       prog_(*prog),
       index_(0),
       code_(0),
-      codeSize_(0),
-      waveLimiter_(
-          this,
-          (prog->isNull() ? 1
-                          : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
-              dev().hwInfo()->simdPerCU_) {
+      codeSize_(0) {
   flags_.hsa_ = true;
 }
 
diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp
index 43fc6ff185..c7477697c0 100644
--- a/rocclr/runtime/device/pal/palkernel.hpp
+++ b/rocclr/runtime/device/pal/palkernel.hpp
@@ -13,7 +13,7 @@
 #include "device/pal/palvirtual.hpp"
 #include "amd_hsa_kernel_code.h"
 #include "device/pal/palprintf.hpp"
-#include "device/pal/palwavelimiter.hpp"
+#include "device/devwavelimiter.hpp"
 #include "hsa.h"
 
 #if defined(WITH_LIGHTNING_COMPILER)
@@ -98,16 +98,6 @@ class HSAILKernel : public device::Kernel {
   //! Returns the kernel index in the program
   uint index() const { return index_; }
 
-  //! Get profiling callback object
-  virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
-    return waveLimiter_.getProfilingCallback(vdev);
-  };
-
-  //! Get waves per shader array to be used for kernel execution.
-  virtual uint getWavesPerSH(const device::VirtualDevice* vdev) const {
-    return waveLimiter_.getWavesPerSH(vdev);
-  };
-
  private:
   //! Disable copy constructor
   HSAILKernel(const HSAILKernel&);
@@ -126,8 +116,6 @@ class HSAILKernel : public device::Kernel {
 
   uint64_t code_;    //!< GPU memory pointer to the kernel
   size_t codeSize_;  //!< Size of ISA code
-
-  WaveLimiterManager waveLimiter_;  //!< adaptively control number of waves
 };
 
 #if defined(WITH_LIGHTNING_COMPILER)