From 19ecdf3fd126c0ebd8d9e253155cb0434654d500 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 15 Oct 2015 10:21:27 -0400
Subject: [PATCH] P4 to Git Change 1200206 by jatang@jatang-opencl-hsa-stg1 on
2015/10/15 10:08:51
SWDEV-56468 - Support wave limiter in HSAIL path.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#302 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#120 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#386 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#6 edit
---
rocclr/runtime/device/gpu/gpukernel.cpp | 15 +++++-
rocclr/runtime/device/gpu/gpukernel.hpp | 13 +++++
rocclr/runtime/device/gpu/gpuvirtual.cpp | 2 +-
rocclr/runtime/device/gpu/gpuwavelimiter.cpp | 51 ++++++++------------
rocclr/runtime/device/gpu/gpuwavelimiter.hpp | 24 ++++++---
5 files changed, 63 insertions(+), 42 deletions(-)
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index b970cedada..d106097777 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -826,7 +826,7 @@ Kernel::create(
// Wave limiter needs to be initialized after kernel metadata is parsed
// Since it depends on it.
- waveLimiter_.enable();
+ waveLimiter_.enable(dev().settings().ciPlus_);
if (result) {
buildError_ = CL_SUCCESS;
@@ -844,7 +844,7 @@ Kernel::Kernel(
const Program& prog,
const InitData* initData)
: NullKernel(name, gpuDev, prog)
- , waveLimiter_(this)
+ , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
{
hwPrivateSize_ = 0;
if (NULL != initData) {
@@ -3371,6 +3371,7 @@ HSAILKernel::HSAILKernel(std::string name,
, codeSize_(0)
, hwMetaData_(NULL)
, extraArgumentsNum_(extraArgsNum)
+ , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
{
hsa_ = true;
}
@@ -3517,6 +3518,16 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
}
index_ = md.kernel_index;
+ size_t sizeOfWavesPerSimdHint;
+ error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
+ RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
+ &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
+ if (error != ACL_SUCCESS) {
+ return false;
+ }
+
+ waveLimiter_.enable(dev().settings().ciPlus_);
+
return true;
}
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 05db79aad9..a516639a85 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -918,6 +918,17 @@ public:
//! Returns kernel's extra argument count
uint extraArgumentsNum() const { return extraArgumentsNum_; }
+ //! Returns the profiling callback object for the given virtual device
+ virtual amd::ProfilingCallback* getProfilingCallback(
+ const device::VirtualDevice *vdev){
+ return waveLimiter_.getProfilingCallback(vdev);
+ }
+
+ //! Get waves per shader array to be used for kernel execution.
+ uint getWavesPerSH(const device::VirtualDevice *vdev) const {
+ return waveLimiter_.getWavesPerSH(vdev);
+ }
+
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -968,6 +979,8 @@ private:
uint value_;
Flags(): value_(0) {}
} flags_;
+
+ WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
/*@}*/} // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 42105e2004..b688a8653b 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1791,7 +1791,7 @@ VirtualGPU::submitKernelInternalHSA(
// Run AQL dispatch in HW
eventBegin(MainEngine);
cs()->AqlDispatch(aqlPkt, vmMems(), cal_.memCount_, scratch, scratchOffset,
- hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
+ hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, hsaKernel.getWavesPerSH(this));
eventEnd(MainEngine, gpuEvent);
if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
index 7fcc0140bb..c50313a9bb 100644
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
+++ b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
@@ -20,24 +20,21 @@ uint WLAlgorithmSmooth::AbandonThresh;
uint WLAlgorithmSmooth::DscThresh;
WaveLimiter::WaveLimiter(
- Kernel* owner,
+ WaveLimiterManager* manager,
uint seqNum,
bool enable,
bool enableDump):
- owner_(owner),
- dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
- auto gpuDev = static_cast(&owner_->dev());
- auto attrib = gpuDev->getAttribs();
- auto hwInfo = gpuDev->hwInfo();
- setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
- attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
+ manager_(manager),
+ dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
+
+ setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
state_ = WARMUP;
if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
- traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
+ traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() +
".txt");
}
@@ -58,8 +55,8 @@ uint WaveLimiter::getWavesPerSH(){
return waves_ * SIMDPerSH_;
}
-WLAlgorithmSmooth::WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump):
- WaveLimiter(owner, seqNum, enable, enableDump) {
+WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
+ WaveLimiter(manager, seqNum, enable, enableDump) {
AdaptCount = 2 * MaxWave + 1;
AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
@@ -122,7 +119,7 @@ void WLAlgorithmSmooth::outputTrace() {
return;
}
- traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+ traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
<< " currWaves=" << currWaves_ << " waves=" << waves_
<< " bestWave=" << bestWave_ << '\n';
output(traceStream_, "\n measure = ", measure_);
@@ -217,8 +214,8 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
state_.push_back(state);
}
-WLAlgorithmAvrg::WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump):
- WaveLimiter(owner, seqNum, enable, enableDump) {
+WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
+ WaveLimiter(manager, seqNum, enable, enableDump) {
measure_.resize(MaxWave + 1);
clear(measure_);
@@ -234,7 +231,7 @@ void WLAlgorithmAvrg::outputTrace() {
return;
}
- traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
+ traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
<< " currWaves=" << currWaves_ << " waves=" << waves_
<< " bestWave=" << bestWave_ << '\n';
output(traceStream_, "\n measure = ", measure_);
@@ -279,17 +276,12 @@ void WLAlgorithmAvrg::callback(ulong duration) {
}
}
-WaveLimiterManager::WaveLimiterManager(Kernel* kernel):
+WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH):
owner_(kernel),
enable_(false),
enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
- auto gpuDev = static_cast(&owner_->dev());
- auto attrib = gpuDev->getAttribs();
- auto hwInfo = gpuDev->hwInfo();
- unsigned simdPerSH = 0;
- setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
- attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
- fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
+ setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
+ fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
WaveLimiterManager::~WaveLimiterManager() {
@@ -326,7 +318,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
return loc->second;
}
- auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
+ auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_,
enableDump_);
if (limiter == NULL) {
enable_ = false;
@@ -336,26 +328,23 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
return limiter;
}
-void WaveLimiterManager::enable() {
+void WaveLimiterManager::enable(const bool isCiPlus) {
if (fixed_ > 0) {
return;
}
- auto gpuDev = static_cast(&owner_->dev());
- auto hwInfo = gpuDev->hwInfo();
+
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
// Disabled for SI due to bug #10817
-
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
enable_ = GPU_WAVE_LIMIT_ENABLE;
}
else {
- if (gpuDev->settings().ciPlus_) {
+ if (isCiPlus) {
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
enable_ = true;
}
else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
- //Todo:
- //fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_;
+ fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
}
}
}
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
index bc2a5fd7b5..07612fa451 100644
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
+++ b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
@@ -16,12 +16,12 @@
//! \namespace gpu GPU Device Implementation
namespace gpu {
-class Kernel;
+class WaveLimiterManager;
// Adaptively limit the number of waves per SIMD based on kernel execution time
class WaveLimiter: public amd::ProfilingCallback {
public:
- explicit WaveLimiter(Kernel*, uint seqNum, bool enable, bool enableDump);
+ explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WaveLimiter();
//! Get waves per shader array to be used for kernel execution.
@@ -57,7 +57,7 @@ protected:
uint bestWave_; // Optimal waves per SIMD
uint countAll_; // Number of kernel executions
StateKind state_;
- Kernel *owner_;
+ WaveLimiterManager* manager_;
DataDumper dumper_;
std::ofstream traceStream_;
uint currWaves_; // Current waves per SIMD
@@ -88,7 +88,7 @@ protected:
class WLAlgorithmSmooth: public WaveLimiter {
public:
- explicit WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump);
+ explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WLAlgorithmSmooth();
private:
std::vector reference_;
@@ -117,7 +117,7 @@ private:
class WLAlgorithmAvrg: public WaveLimiter {
public:
- explicit WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump);
+ explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WLAlgorithmAvrg();
private:
//! Call back from Event::recordProfilingInfo to get execution time.
@@ -130,7 +130,7 @@ private:
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
class WaveLimiterManager {
public:
- explicit WaveLimiterManager(Kernel* owner);
+ explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
virtual ~WaveLimiterManager();
//! Get waves per shader array for a specific virtual device.
@@ -140,9 +140,17 @@ public:
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
//! Enable wave limiter manager by kernel metadata and flags.
- void enable();
+ void enable(const bool isCiPlus);
+
+ //! Returns the kernel name
+ const std::string& name() const { return owner_->name(); }
+
+ //! Returns the number of SIMDs per shader array (SH).
+ uint getSimdPerSH() const {return simdPerSH_;}
+
private:
- Kernel *owner_; // The kernel which owns this object
+ device::Kernel *owner_; // The kernel which owns this object
+ uint simdPerSH_; // Simd Per SH
std::unordered_map limiters_; // Maps virtual device to wave limiter
bool enable_; // Whether the adaptation is enabled