P4 to Git Change 1200206 by jatang@jatang-opencl-hsa-stg1 on 2015/10/15 10:08:51

SWDEV-56468 - Support wave limiter in HSAIL path.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#302 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#120 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#386 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#6 edit
This commit is contained in:
foreman
2015-10-15 10:21:27 -04:00
szülő 4996170c79
commit 19ecdf3fd1
5 fájl változott, egészen pontosan 63 új sor hozzáadva és 42 régi sor törölve
@@ -826,7 +826,7 @@ Kernel::create(
// The wave limiter must be enabled only after the kernel metadata has been
// parsed, because enabling it depends on that metadata.
waveLimiter_.enable();
waveLimiter_.enable(dev().settings().ciPlus_);
if (result) {
buildError_ = CL_SUCCESS;
@@ -844,7 +844,7 @@ Kernel::Kernel(
const Program& prog,
const InitData* initData)
: NullKernel(name, gpuDev, prog)
, waveLimiter_(this)
, waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
{
hwPrivateSize_ = 0;
if (NULL != initData) {
@@ -3371,6 +3371,7 @@ HSAILKernel::HSAILKernel(std::string name,
, codeSize_(0)
, hwMetaData_(NULL)
, extraArgumentsNum_(extraArgsNum)
, waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_)
{
hsa_ = true;
}
@@ -3517,6 +3518,16 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
}
index_ = md.kernel_index;
size_t sizeOfWavesPerSimdHint;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(),
&workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint);
if (error != ACL_SUCCESS) {
return false;
}
waveLimiter_.enable(dev().settings().ciPlus_);
return true;
}
@@ -918,6 +918,17 @@ public:
//! Returns kernel's extra argument count
uint extraArgumentsNum() const { return extraArgumentsNum_; }
//! Get the profiling callback object for the given virtual device.
//!
//! Forwards the request to this kernel's WaveLimiterManager (waveLimiter_),
//! passing \a vdev through unchanged, and returns whatever the manager returns.
//! NOTE(review): the manager appears to keep one limiter per virtual device and
//! hand it back as the callback — confirm against WaveLimiterManager.
virtual amd::ProfilingCallback* getProfilingCallback(
const device::VirtualDevice *vdev){
return waveLimiter_.getProfilingCallback(vdev);
}
//! Get waves per shader array to be used for kernel execution.
//!
//! Forwards to this kernel's WaveLimiterManager (waveLimiter_) for the given
//! virtual device \a vdev. The HSA submission path passes the result as the
//! last argument of the AQL dispatch call.
uint getWavesPerSH(const device::VirtualDevice *vdev) const {
return waveLimiter_.getWavesPerSH(vdev);
}
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -968,6 +979,8 @@ private:
uint value_;
Flags(): value_(0) {}
} flags_;
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
/*@}*/} // namespace gpu
@@ -1791,7 +1791,7 @@ VirtualGPU::submitKernelInternalHSA(
// Run AQL dispatch in HW
eventBegin(MainEngine);
cs()->AqlDispatch(aqlPkt, vmMems(), cal_.memCount_, scratch, scratchOffset,
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, hsaKernel.getWavesPerSH(this));
eventEnd(MainEngine, gpuEvent);
if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) {
@@ -20,24 +20,21 @@ uint WLAlgorithmSmooth::AbandonThresh;
uint WLAlgorithmSmooth::DscThresh;
WaveLimiter::WaveLimiter(
Kernel* owner,
WaveLimiterManager* manager,
uint seqNum,
bool enable,
bool enableDump):
owner_(owner),
dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
auto gpuDev = static_cast<const Device*>(&owner_->dev());
auto attrib = gpuDev->getAttribs();
auto hwInfo = gpuDev->hwInfo();
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
manager_(manager),
dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
state_ = WARMUP;
if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() +
".txt");
}
@@ -58,8 +55,8 @@ uint WaveLimiter::getWavesPerSH(){
return waves_ * SIMDPerSH_;
}
WLAlgorithmSmooth::WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump):
WaveLimiter(owner, seqNum, enable, enableDump) {
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
WaveLimiter(manager, seqNum, enable, enableDump) {
AdaptCount = 2 * MaxWave + 1;
AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
@@ -122,7 +119,7 @@ void WLAlgorithmSmooth::outputTrace() {
return;
}
traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
<< " currWaves=" << currWaves_ << " waves=" << waves_
<< " bestWave=" << bestWave_ << '\n';
output(traceStream_, "\n measure = ", measure_);
@@ -217,8 +214,8 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
state_.push_back(state);
}
WLAlgorithmAvrg::WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump):
WaveLimiter(owner, seqNum, enable, enableDump) {
WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump):
WaveLimiter(manager, seqNum, enable, enableDump) {
measure_.resize(MaxWave + 1);
clear(measure_);
@@ -234,7 +231,7 @@ void WLAlgorithmAvrg::outputTrace() {
return;
}
traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
<< " currWaves=" << currWaves_ << " waves=" << waves_
<< " bestWave=" << bestWave_ << '\n';
output(traceStream_, "\n measure = ", measure_);
@@ -279,17 +276,12 @@ void WLAlgorithmAvrg::callback(ulong duration) {
}
}
WaveLimiterManager::WaveLimiterManager(Kernel* kernel):
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH):
owner_(kernel),
enable_(false),
enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
auto gpuDev = static_cast<const Device*>(&owner_->dev());
auto attrib = gpuDev->getAttribs();
auto hwInfo = gpuDev->hwInfo();
unsigned simdPerSH = 0;
setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_);
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
WaveLimiterManager::~WaveLimiterManager() {
@@ -326,7 +318,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
return loc->second;
}
auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_,
enableDump_);
if (limiter == NULL) {
enable_ = false;
@@ -336,26 +328,23 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
return limiter;
}
void WaveLimiterManager::enable() {
void WaveLimiterManager::enable(const bool isCiPlus) {
if (fixed_ > 0) {
return;
}
auto gpuDev = static_cast<const Device*>(&owner_->dev());
auto hwInfo = gpuDev->hwInfo();
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
// Disabled for SI due to bug #10817
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
enable_ = GPU_WAVE_LIMIT_ENABLE;
}
else {
if (gpuDev->settings().ciPlus_) {
if (isCiPlus) {
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
enable_ = true;
}
else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
//Todo:
//fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_;
fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
}
}
}
@@ -16,12 +16,12 @@
//! \namespace gpu GPU Device Implementation
namespace gpu {
class Kernel;
class WaveLimiterManager;
// Adaptively limit the number of waves per SIMD based on kernel execution time
class WaveLimiter: public amd::ProfilingCallback {
public:
explicit WaveLimiter(Kernel*, uint seqNum, bool enable, bool enableDump);
explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WaveLimiter();
//! Get waves per shader array to be used for kernel execution.
@@ -57,7 +57,7 @@ protected:
uint bestWave_; // Optimal waves per SIMD
uint countAll_; // Number of kernel executions
StateKind state_;
Kernel *owner_;
WaveLimiterManager* manager_;
DataDumper dumper_;
std::ofstream traceStream_;
uint currWaves_; // Current waves per SIMD
@@ -88,7 +88,7 @@ protected:
class WLAlgorithmSmooth: public WaveLimiter {
public:
explicit WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump);
explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WLAlgorithmSmooth();
private:
std::vector<uint64_t> reference_;
@@ -117,7 +117,7 @@ private:
class WLAlgorithmAvrg: public WaveLimiter {
public:
explicit WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump);
explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WLAlgorithmAvrg();
private:
//! Call back from Event::recordProfilingInfo to get execution time.
@@ -130,7 +130,7 @@ private:
// Creates a wave limiter for each virtual device that runs a kernel and manages those wave limiters.
class WaveLimiterManager {
public:
explicit WaveLimiterManager(Kernel* owner);
explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
virtual ~WaveLimiterManager();
//! Get waves per shader array for a specific virtual device.
@@ -140,9 +140,17 @@ public:
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
//! Enable wave limiter manager by kernel metadata and flags.
void enable();
void enable(const bool isCiPlus);
//! Returns the name of the owning kernel (owner_->name()).
//! The wave limiters use this name to label their data dumper and to build
//! the trace file name and trace lines.
const std::string& name() const { return owner_->name(); }
//! Get the SIMD-per-shader-array value held by this manager (simdPerSH_).
//! The constructor stores it via setIfNotDefault with GPU_WAVE_LIMIT_CU_PER_SH
//! and the simdPerSH argument supplied by the kernel.
//! NOTE(review): presumably the flag overrides the argument when it is set to a
//! non-default value — confirm against setIfNotDefault.
uint getSimdPerSH() const {return simdPerSH_;}
private:
Kernel *owner_; // The kernel which owns this object
device::Kernel *owner_; // The kernel which owns this object
uint simdPerSH_; // Simd Per SH
std::unordered_map<const device::VirtualDevice *,
WaveLimiter*> limiters_; // Maps virtual device to wave limiter
bool enable_; // Whether the adaptation is enabled