From 19ecdf3fd126c0ebd8d9e253155cb0434654d500 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 15 Oct 2015 10:21:27 -0400
Subject: [PATCH] P4 to Git Change 1200206 by jatang@jatang-opencl-hsa-stg1 on
 2015/10/15 10:08:51

	SWDEV-56468 - Support wave limiter in HSAIL path.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#302 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#120 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#386 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#6 edit
---
 rocclr/runtime/device/gpu/gpukernel.cpp      | 15 +++++-
 rocclr/runtime/device/gpu/gpukernel.hpp      | 13 +++++
 rocclr/runtime/device/gpu/gpuvirtual.cpp     |  2 +-
 rocclr/runtime/device/gpu/gpuwavelimiter.cpp | 51 ++++++++------------
 rocclr/runtime/device/gpu/gpuwavelimiter.hpp | 24 ++++++---
 5 files changed, 63 insertions(+), 42 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index b970cedada..d106097777 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -826,7 +826,7 @@ Kernel::create(
 
     // Wave limiter needs to be initialized after kernel metadata is parsed
     // Since it depends on it.
-    waveLimiter_.enable();
+    waveLimiter_.enable(dev().settings().ciPlus_);
 
     if (result) {
         buildError_ = CL_SUCCESS;
@@ -844,7 +844,7 @@ Kernel::Kernel(
     const Program&      prog,
     const InitData*     initData)
     : NullKernel(name, gpuDev, prog)
-    , waveLimiter_(this)
+    , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
 {
     hwPrivateSize_ = 0;
     if (NULL != initData) {
@@ -3371,6 +3371,7 @@ HSAILKernel::HSAILKernel(std::string name,
     , codeSize_(0)
     , hwMetaData_(NULL)
     , extraArgumentsNum_(extraArgsNum)
+    , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
 {
     hsa_ = true;
 }
@@ -3517,6 +3518,16 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
     }
     index_ = md.kernel_index;
 
+    size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); // never hand aclQueryInfo an indeterminate size
+    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
+        RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
+        &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
+    if (error != ACL_SUCCESS) {
+        return false;  // hint query failed: report kernel init failure
+    }
+
+    waveLimiter_.enable(dev().settings().ciPlus_);  // enable() reads wavesPerSimdHint_, so it must follow the query
+
     return true;
 }
 
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 05db79aad9..a516639a85 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -918,6 +918,17 @@ public:
     //! Returns kernel's extra argument count
     uint extraArgumentsNum() const { return extraArgumentsNum_; }
 
+    //! Returns the profiling callback for \p vdev by forwarding to the wave limiter manager
+    virtual amd::ProfilingCallback* getProfilingCallback(
+        const device::VirtualDevice *vdev){
+        return waveLimiter_.getProfilingCallback(vdev);
+    }
+
+    //! Returns the waves per shader array reported by the wave limiter manager for \p vdev
+    uint getWavesPerSH(const device::VirtualDevice *vdev) const {
+        return waveLimiter_.getWavesPerSH(vdev);
+    }
+
 private:
     //! Disable copy constructor
     HSAILKernel(const HSAILKernel&);
@@ -968,6 +979,8 @@ private:
         uint    value_;
         Flags(): value_(0) {}
     } flags_;
+
+    WaveLimiterManager waveLimiter_; //!< Wave limiter manager: adaptively controls the number of waves
 };
 
 /*@}*/} // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 42105e2004..b688a8653b 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1791,7 +1791,7 @@ VirtualGPU::submitKernelInternalHSA(
         // Run AQL dispatch in HW
         eventBegin(MainEngine);
         cs()->AqlDispatch(aqlPkt, vmMems(), cal_.memCount_, scratch, scratchOffset,
-            hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+            hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, hsaKernel.getWavesPerSH(this));
         eventEnd(MainEngine, gpuEvent);
 
         if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
index 7fcc0140bb..c50313a9bb 100644
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
+++ b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
@@ -20,24 +20,21 @@ uint WLAlgorithmSmooth::AbandonThresh;
 uint WLAlgorithmSmooth::DscThresh;
 
 WaveLimiter::WaveLimiter(
-        Kernel* owner,
+        WaveLimiterManager* manager,
         uint    seqNum,
         bool    enable,
         bool    enableDump):
-        owner_(owner),
-        dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
-    auto gpuDev = static_cast<const Device*>(&owner_->dev());
-    auto attrib = gpuDev->getAttribs();
-    auto hwInfo = gpuDev->hwInfo();
-    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
-            attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
+        manager_(manager),
+        dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
+
+    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
     MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
     WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
     RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
 
     state_ = WARMUP;
     if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
-        traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
+        traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() +
             ".txt");
     }
 
@@ -58,8 +55,8 @@ uint WaveLimiter::getWavesPerSH(){
     return waves_ * SIMDPerSH_;
 }
 
-WLAlgorithmSmooth::WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump):
-    WaveLimiter(owner, seqNum, enable, enableDump) {
+WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
+    WaveLimiter(manager, seqNum, enable, enableDump) {
     AdaptCount = 2 * MaxWave + 1;
     AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
     DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
@@ -122,7 +119,7 @@ void WLAlgorithmSmooth::outputTrace() {
         return;
     }
 
-    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+    traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
             << " currWaves=" << currWaves_ << " waves=" << waves_
             << " bestWave=" << bestWave_ << '\n';
     output(traceStream_, "\n measure = ", measure_);
@@ -217,8 +214,8 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
     state_.push_back(state);
 }
 
-WLAlgorithmAvrg::WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump):
-    WaveLimiter(owner, seqNum, enable, enableDump) {
+WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
+    WaveLimiter(manager, seqNum, enable, enableDump) {
 
     measure_.resize(MaxWave + 1);
     clear(measure_);
@@ -234,7 +231,7 @@ void WLAlgorithmAvrg::outputTrace() {
         return;
     }
 
-    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+    traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
             << " currWaves=" << currWaves_ << " waves=" << waves_
             << " bestWave=" << bestWave_ << '\n';
     output(traceStream_, "\n measure = ", measure_);
@@ -279,17 +276,12 @@ void WLAlgorithmAvrg::callback(ulong duration) {
     }
 }
 
-WaveLimiterManager::WaveLimiterManager(Kernel* kernel):
+WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH):
         owner_(kernel),
         enable_(false),
         enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
-    auto gpuDev = static_cast<const Device*>(&owner_->dev());
-    auto attrib = gpuDev->getAttribs();
-    auto hwInfo = gpuDev->hwInfo();
-    unsigned simdPerSH = 0;
-    setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
-            attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
-    fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
+    setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH); // NOTE(review): presumably the flag wins when set, else simdPerSH - confirm against the macro
+    fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_; // a non-zero fixed_ makes enable() return early
 }
 
 WaveLimiterManager::~WaveLimiterManager() {
@@ -326,7 +318,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
         return loc->second;
     }
 
-    auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
+    auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_,
             enableDump_);
     if (limiter == NULL) {
         enable_ = false;
@@ -336,26 +328,23 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
     return limiter;
 }
 
-void WaveLimiterManager::enable() {
+void WaveLimiterManager::enable(const bool isCiPlus) {
     if (fixed_ > 0) {
         return;
     }
-    auto gpuDev = static_cast<const Device*>(&owner_->dev());
-    auto hwInfo = gpuDev->hwInfo();
+
     // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
     // Disabled for SI due to bug #10817
-
     if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
         enable_ = GPU_WAVE_LIMIT_ENABLE;
     }
     else {
-        if (gpuDev->settings().ciPlus_) {
+        if (isCiPlus) {
             if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
                 enable_ = true;
             }
             else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
-                //Todo:
-                //fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_;
+                fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
             }
         }
     }
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
index bc2a5fd7b5..07612fa451 100644
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
+++ b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
@@ -16,12 +16,12 @@
 //! \namespace gpu GPU Device Implementation
 namespace gpu {
 
-class Kernel;
+class WaveLimiterManager;
 
 // Adaptively limit the number of waves per SIMD based on kernel execution time
 class WaveLimiter: public amd::ProfilingCallback {
 public:
-    explicit WaveLimiter(Kernel*, uint seqNum, bool enable, bool enableDump);
+    explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
     virtual ~WaveLimiter();
 
     //! Get waves per shader array to be used for kernel execution.
@@ -57,7 +57,7 @@ protected:
     uint bestWave_;      // Optimal waves per SIMD
     uint countAll_;      // Number of kernel executions
     StateKind state_;
-    Kernel *owner_;
+    WaveLimiterManager* manager_;
     DataDumper dumper_;
     std::ofstream traceStream_;
     uint currWaves_;     // Current waves per SIMD
@@ -88,7 +88,7 @@ protected:
 
 class WLAlgorithmSmooth: public WaveLimiter {
 public:
-    explicit WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump);
+    explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
     virtual ~WLAlgorithmSmooth();
 private:
     std::vector<uint64_t> reference_;
@@ -117,7 +117,7 @@ private:
 
 class WLAlgorithmAvrg: public WaveLimiter {
 public:
-    explicit WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump);
+    explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
     virtual ~WLAlgorithmAvrg();
 private:
     //! Call back from Event::recordProfilingInfo to get execution time.
@@ -130,7 +130,7 @@ private:
 // Create wave limiter for each virtual device for a kernel and manages the wave limiters.
 class WaveLimiterManager {
 public:
-    explicit WaveLimiterManager(Kernel* owner);
+    explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
     virtual ~WaveLimiterManager();
 
     //! Get waves per shader array for a specific virtual device.
@@ -140,9 +140,17 @@ public:
     amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
 
     //! Enable wave limiter manager by kernel metadata and flags.
-    void enable();
+    void enable(const bool isCiPlus);
+
+    //! Returns the name of the owning kernel (forwards to owner_->name())
+    const std::string& name() const { return owner_->name(); }
+
+    //! Returns the SIMD-per-shader-array count stored by the constructor
+    uint getSimdPerSH() const {return simdPerSH_;}
+
 private:
-    Kernel *owner_;                // The kernel which owns this object
+    device::Kernel *owner_;        // The kernel which owns this object
+    uint simdPerSH_;               // SIMDs per shader array (SH), set in the constructor
     std::unordered_map<const device::VirtualDevice *,
         WaveLimiter*> limiters_;   // Maps virtual device to wave limiter
     bool enable_;                  // Whether the adaptation is enabled