P4 to Git Change 1200206 by jatang@jatang-opencl-hsa-stg1 on 2015/10/15 10:08:51
SWDEV-56468 - Support wave limiter in HSAIL path.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#302 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#120 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#386 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#6 edit
This commit is contained in:
@@ -826,7 +826,7 @@ Kernel::create(
|
||||
|
||||
// Wave limiter needs to be initialized after kernel metadata is parsed
|
||||
// Since it depends on it.
|
||||
waveLimiter_.enable();
|
||||
waveLimiter_.enable(dev().settings().ciPlus_);
|
||||
|
||||
if (result) {
|
||||
buildError_ = CL_SUCCESS;
|
||||
@@ -844,7 +844,7 @@ Kernel::Kernel(
|
||||
const Program& prog,
|
||||
const InitData* initData)
|
||||
: NullKernel(name, gpuDev, prog)
|
||||
, waveLimiter_(this)
|
||||
, waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
|
||||
{
|
||||
hwPrivateSize_ = 0;
|
||||
if (NULL != initData) {
|
||||
@@ -3371,6 +3371,7 @@ HSAILKernel::HSAILKernel(std::string name,
|
||||
, codeSize_(0)
|
||||
, hwMetaData_(NULL)
|
||||
, extraArgumentsNum_(extraArgsNum)
|
||||
, waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
|
||||
{
|
||||
hsa_ = true;
|
||||
}
|
||||
@@ -3517,6 +3518,16 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
|
||||
}
|
||||
index_ = md.kernel_index;
|
||||
|
||||
size_t sizeOfWavesPerSimdHint;
|
||||
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
||||
RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
|
||||
&workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
|
||||
if (error != ACL_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
waveLimiter_.enable(dev().settings().ciPlus_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -918,6 +918,17 @@ public:
|
||||
//! Returns kernel's extra argument count
|
||||
uint extraArgumentsNum() const { return extraArgumentsNum_; }
|
||||
|
||||
//! Get profiling callback object
|
||||
virtual amd::ProfilingCallback* getProfilingCallback(
|
||||
const device::VirtualDevice *vdev){
|
||||
return waveLimiter_.getProfilingCallback(vdev);
|
||||
}
|
||||
|
||||
//! Get waves per shader array to be used for kernel execution.
|
||||
uint getWavesPerSH(const device::VirtualDevice *vdev) const {
|
||||
return waveLimiter_.getWavesPerSH(vdev);
|
||||
}
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
HSAILKernel(const HSAILKernel&);
|
||||
@@ -968,6 +979,8 @@ private:
|
||||
uint value_;
|
||||
Flags(): value_(0) {}
|
||||
} flags_;
|
||||
|
||||
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
|
||||
};
|
||||
|
||||
/*@}*/} // namespace gpu
|
||||
|
||||
@@ -1791,7 +1791,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
// Run AQL dispatch in HW
|
||||
eventBegin(MainEngine);
|
||||
cs()->AqlDispatch(aqlPkt, vmMems(), cal_.memCount_, scratch, scratchOffset,
|
||||
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
|
||||
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, hsaKernel.getWavesPerSH(this));
|
||||
eventEnd(MainEngine, gpuEvent);
|
||||
|
||||
if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
|
||||
|
||||
@@ -20,24 +20,21 @@ uint WLAlgorithmSmooth::AbandonThresh;
|
||||
uint WLAlgorithmSmooth::DscThresh;
|
||||
|
||||
WaveLimiter::WaveLimiter(
|
||||
Kernel* owner,
|
||||
WaveLimiterManager* manager,
|
||||
uint seqNum,
|
||||
bool enable,
|
||||
bool enableDump):
|
||||
owner_(owner),
|
||||
dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
|
||||
auto gpuDev = static_cast<const Device*>(&owner_->dev());
|
||||
auto attrib = gpuDev->getAttribs();
|
||||
auto hwInfo = gpuDev->hwInfo();
|
||||
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
|
||||
attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
|
||||
manager_(manager),
|
||||
dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
|
||||
|
||||
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
|
||||
MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
|
||||
WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
|
||||
RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
|
||||
|
||||
state_ = WARMUP;
|
||||
if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
|
||||
traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
|
||||
traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() +
|
||||
".txt");
|
||||
}
|
||||
|
||||
@@ -58,8 +55,8 @@ uint WaveLimiter::getWavesPerSH(){
|
||||
return waves_ * SIMDPerSH_;
|
||||
}
|
||||
|
||||
WLAlgorithmSmooth::WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump):
|
||||
WaveLimiter(owner, seqNum, enable, enableDump) {
|
||||
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
|
||||
WaveLimiter(manager, seqNum, enable, enableDump) {
|
||||
AdaptCount = 2 * MaxWave + 1;
|
||||
AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
|
||||
DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
|
||||
@@ -122,7 +119,7 @@ void WLAlgorithmSmooth::outputTrace() {
|
||||
return;
|
||||
}
|
||||
|
||||
traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
|
||||
traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
|
||||
<< " currWaves=" << currWaves_ << " waves=" << waves_
|
||||
<< " bestWave=" << bestWave_ << '\n';
|
||||
output(traceStream_, "\n measure = ", measure_);
|
||||
@@ -217,8 +214,8 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
|
||||
state_.push_back(state);
|
||||
}
|
||||
|
||||
WLAlgorithmAvrg::WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump):
|
||||
WaveLimiter(owner, seqNum, enable, enableDump) {
|
||||
WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
|
||||
WaveLimiter(manager, seqNum, enable, enableDump) {
|
||||
|
||||
measure_.resize(MaxWave + 1);
|
||||
clear(measure_);
|
||||
@@ -234,7 +231,7 @@ void WLAlgorithmAvrg::outputTrace() {
|
||||
return;
|
||||
}
|
||||
|
||||
traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
|
||||
traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
|
||||
<< " currWaves=" << currWaves_ << " waves=" << waves_
|
||||
<< " bestWave=" << bestWave_ << '\n';
|
||||
output(traceStream_, "\n measure = ", measure_);
|
||||
@@ -279,17 +276,12 @@ void WLAlgorithmAvrg::callback(ulong duration) {
|
||||
}
|
||||
}
|
||||
|
||||
WaveLimiterManager::WaveLimiterManager(Kernel* kernel):
|
||||
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH):
|
||||
owner_(kernel),
|
||||
enable_(false),
|
||||
enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
|
||||
auto gpuDev = static_cast<const Device*>(&owner_->dev());
|
||||
auto attrib = gpuDev->getAttribs();
|
||||
auto hwInfo = gpuDev->hwInfo();
|
||||
unsigned simdPerSH = 0;
|
||||
setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
|
||||
attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
|
||||
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
|
||||
setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
|
||||
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
|
||||
}
|
||||
|
||||
WaveLimiterManager::~WaveLimiterManager() {
|
||||
@@ -326,7 +318,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
|
||||
return loc->second;
|
||||
}
|
||||
|
||||
auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
|
||||
auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_,
|
||||
enableDump_);
|
||||
if (limiter == NULL) {
|
||||
enable_ = false;
|
||||
@@ -336,26 +328,23 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
|
||||
return limiter;
|
||||
}
|
||||
|
||||
void WaveLimiterManager::enable() {
|
||||
void WaveLimiterManager::enable(const bool isCiPlus) {
|
||||
if (fixed_ > 0) {
|
||||
return;
|
||||
}
|
||||
auto gpuDev = static_cast<const Device*>(&owner_->dev());
|
||||
auto hwInfo = gpuDev->hwInfo();
|
||||
|
||||
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
|
||||
// Disabled for SI due to bug #10817
|
||||
|
||||
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
|
||||
enable_ = GPU_WAVE_LIMIT_ENABLE;
|
||||
}
|
||||
else {
|
||||
if (gpuDev->settings().ciPlus_) {
|
||||
if (isCiPlus) {
|
||||
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
|
||||
enable_ = true;
|
||||
}
|
||||
else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
|
||||
//Todo:
|
||||
//fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_;
|
||||
fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,12 +16,12 @@
|
||||
//! \namespace gpu GPU Device Implementation
|
||||
namespace gpu {
|
||||
|
||||
class Kernel;
|
||||
class WaveLimiterManager;
|
||||
|
||||
// Adaptively limit the number of waves per SIMD based on kernel execution time
|
||||
class WaveLimiter: public amd::ProfilingCallback {
|
||||
public:
|
||||
explicit WaveLimiter(Kernel*, uint seqNum, bool enable, bool enableDump);
|
||||
explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WaveLimiter();
|
||||
|
||||
//! Get waves per shader array to be used for kernel execution.
|
||||
@@ -57,7 +57,7 @@ protected:
|
||||
uint bestWave_; // Optimal waves per SIMD
|
||||
uint countAll_; // Number of kernel executions
|
||||
StateKind state_;
|
||||
Kernel *owner_;
|
||||
WaveLimiterManager* manager_;
|
||||
DataDumper dumper_;
|
||||
std::ofstream traceStream_;
|
||||
uint currWaves_; // Current waves per SIMD
|
||||
@@ -88,7 +88,7 @@ protected:
|
||||
|
||||
class WLAlgorithmSmooth: public WaveLimiter {
|
||||
public:
|
||||
explicit WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump);
|
||||
explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WLAlgorithmSmooth();
|
||||
private:
|
||||
std::vector<uint64_t> reference_;
|
||||
@@ -117,7 +117,7 @@ private:
|
||||
|
||||
class WLAlgorithmAvrg: public WaveLimiter {
|
||||
public:
|
||||
explicit WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump);
|
||||
explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WLAlgorithmAvrg();
|
||||
private:
|
||||
//! Call back from Event::recordProfilingInfo to get execution time.
|
||||
@@ -130,7 +130,7 @@ private:
|
||||
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
|
||||
class WaveLimiterManager {
|
||||
public:
|
||||
explicit WaveLimiterManager(Kernel* owner);
|
||||
explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
|
||||
virtual ~WaveLimiterManager();
|
||||
|
||||
//! Get waves per shader array for a specific virtual device.
|
||||
@@ -140,9 +140,17 @@ public:
|
||||
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
|
||||
|
||||
//! Enable wave limiter manager by kernel metadata and flags.
|
||||
void enable();
|
||||
void enable(const bool isCiPlus);
|
||||
|
||||
//! Returns the kernel name
|
||||
const std::string& name() const { return owner_->name(); }
|
||||
|
||||
//! Get SimdPerSH.
|
||||
uint getSimdPerSH() const {return simdPerSH_;}
|
||||
|
||||
private:
|
||||
Kernel *owner_; // The kernel which owns this object
|
||||
device::Kernel *owner_; // The kernel which owns this object
|
||||
uint simdPerSH_; // Simd Per SH
|
||||
std::unordered_map<const device::VirtualDevice *,
|
||||
WaveLimiter*> limiters_; // Maps virtual device to wave limiter
|
||||
bool enable_; // Whether the adaptation is enabled
|
||||
|
||||
Reference in New Issue
Block a user