diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index b970cedada..d106097777 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -826,7 +826,7 @@ Kernel::create( // Wave limiter needs to be initialized after kernel metadata is parsed // Since it depends on it. - waveLimiter_.enable(); + waveLimiter_.enable(dev().settings().ciPlus_); if (result) { buildError_ = CL_SUCCESS; @@ -844,7 +844,7 @@ Kernel::Kernel( const Program& prog, const InitData* initData) : NullKernel(name, gpuDev, prog) - , waveLimiter_(this) + , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) { hwPrivateSize_ = 0; if (NULL != initData) { @@ -3371,6 +3371,7 @@ HSAILKernel::HSAILKernel(std::string name, , codeSize_(0) , hwMetaData_(NULL) , extraArgumentsNum_(extraArgsNum) + , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) { hsa_ = true; } @@ -3517,6 +3518,16 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) } index_ = md.kernel_index; + size_t sizeOfWavesPerSimdHint; + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), + RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), + &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); + if (error != ACL_SUCCESS) { + return false; + } + + waveLimiter_.enable(dev().settings().ciPlus_); + return true; } diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index 05db79aad9..a516639a85 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -918,6 +918,17 @@ public: //! Returns kernel's extra argument count uint extraArgumentsNum() const { return extraArgumentsNum_; } + //! Get profiling callback object + virtual amd::ProfilingCallback* getProfilingCallback( + const device::VirtualDevice *vdev){ + return waveLimiter_.getProfilingCallback(vdev); + } + + //! Get waves per shader array to be used for kernel execution. + uint getWavesPerSH(const device::VirtualDevice *vdev) const { + return waveLimiter_.getWavesPerSH(vdev); + } + private: //! Disable copy constructor HSAILKernel(const HSAILKernel&); @@ -968,6 +979,8 @@ private: uint value_; Flags(): value_(0) {} } flags_; + + WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index 42105e2004..b688a8653b 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -1791,7 +1791,7 @@ VirtualGPU::submitKernelInternalHSA( // Run AQL dispatch in HW eventBegin(MainEngine); cs()->AqlDispatch(aqlPkt, vmMems(), cal_.memCount_, scratch, scratchOffset, - hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo); + hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, hsaKernel.getWavesPerSH(this)); eventEnd(MainEngine, gpuEvent); if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) { diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp index 7fcc0140bb..c50313a9bb 100644 --- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp +++ b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp @@ -20,24 +20,21 @@ uint WLAlgorithmSmooth::AbandonThresh; uint WLAlgorithmSmooth::DscThresh; WaveLimiter::WaveLimiter( - Kernel* owner, + WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump): - owner_(owner), - dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) { - auto gpuDev = static_cast(&owner_->dev()); - auto attrib = gpuDev->getAttribs(); - auto hwInfo = gpuDev->hwInfo(); - setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, - attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_); + manager_(manager), + dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { + + setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; WarmUpCount = GPU_WAVE_LIMIT_WARMUP; RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; state_ = WARMUP; if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { - traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() + + traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt"); } @@ -58,8 +55,8 @@ uint WaveLimiter::getWavesPerSH(){ return waves_ * SIMDPerSH_; } -WLAlgorithmSmooth::WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump): - WaveLimiter(owner, seqNum, enable, enableDump) { +WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump): + WaveLimiter(manager, seqNum, enable, enableDump) { AdaptCount = 2 * MaxWave + 1; AbandonThresh = GPU_WAVE_LIMIT_ABANDON; DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; @@ -122,7 +119,7 @@ void WLAlgorithmSmooth::outputTrace() { return; } - traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_ + traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ << '\n'; output(traceStream_, "\n measure = ", measure_); @@ -217,8 +214,8 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { state_.push_back(state); } -WLAlgorithmAvrg::WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump): - WaveLimiter(owner, seqNum, enable, enableDump) { +WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump): + WaveLimiter(manager, seqNum, enable, enableDump) { measure_.resize(MaxWave + 1); clear(measure_); @@ -234,7 +231,7 @@ void WLAlgorithmAvrg::outputTrace() { return; } - traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_ + traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ << '\n'; output(traceStream_, "\n measure = ", measure_); @@ -279,17 +276,12 @@ void WLAlgorithmAvrg::callback(ulong duration) { } } -WaveLimiterManager::WaveLimiterManager(Kernel* kernel): +WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH): owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) { - auto gpuDev = static_cast(&owner_->dev()); - auto attrib = gpuDev->getAttribs(); - auto hwInfo = gpuDev->hwInfo(); - unsigned simdPerSH = 0; - setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH, - attrib.numberOfCUsperShaderArray * hwInfo->simdPerCU_); - fixed_ = GPU_WAVES_PER_SIMD * simdPerSH; + setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH); + fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_; } WaveLimiterManager::~WaveLimiterManager() { @@ -326,7 +318,7 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( return loc->second; } - auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_, + auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_); if (limiter == NULL) { enable_ = false; @@ -336,26 +328,23 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( return limiter; } -void WaveLimiterManager::enable() { +void WaveLimiterManager::enable(const bool isCiPlus) { if (fixed_ > 0) { return; } - auto gpuDev = static_cast(&owner_->dev()); - auto hwInfo = gpuDev->hwInfo(); + // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 // Disabled for SI due to bug #10817 - if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { enable_ = GPU_WAVE_LIMIT_ENABLE; } else { - if (gpuDev->settings().ciPlus_) { + if (isCiPlus) { if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { enable_ = true; } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { - //Todo: - //fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_; + fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); } } } diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp index bc2a5fd7b5..07612fa451 100644 --- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp +++ b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp @@ -16,12 +16,12 @@ //! \namespace gpu GPU Device Implementation namespace gpu { -class Kernel; +class WaveLimiterManager; // Adaptively limit the number of waves per SIMD based on kernel execution time class WaveLimiter: public amd::ProfilingCallback { public: - explicit WaveLimiter(Kernel*, uint seqNum, bool enable, bool enableDump); + explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); virtual ~WaveLimiter(); //! Get waves per shader array to be used for kernel execution. @@ -57,7 +57,7 @@ protected: uint bestWave_; // Optimal waves per SIMD uint countAll_; // Number of kernel executions StateKind state_; - Kernel *owner_; + WaveLimiterManager* manager_; DataDumper dumper_; std::ofstream traceStream_; uint currWaves_; // Current waves per SIMD @@ -88,7 +88,7 @@ protected: class WLAlgorithmSmooth: public WaveLimiter { public: - explicit WLAlgorithmSmooth(Kernel* owner, uint seqNum, bool enable, bool enableDump); + explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); virtual ~WLAlgorithmSmooth(); private: std::vector reference_; @@ -117,7 +117,7 @@ private: class WLAlgorithmAvrg: public WaveLimiter { public: - explicit WLAlgorithmAvrg(Kernel* owner, uint seqNum, bool enable, bool enableDump); + explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); virtual ~WLAlgorithmAvrg(); private: //! Call back from Event::recordProfilingInfo to get execution time. @@ -130,7 +130,7 @@ private: // Create wave limiter for each virtual device for a kernel and manages the wave limiters. class WaveLimiterManager { public: - explicit WaveLimiterManager(Kernel* owner); + explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); virtual ~WaveLimiterManager(); //! Get waves per shader array for a specific virtual device. @@ -140,9 +140,17 @@ public: amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *); //! Enable wave limiter manager by kernel metadata and flags. - void enable(); + void enable(const bool isCiPlus); + + //! Returns the kernel name + const std::string& name() const { return owner_->name(); } + + //! Get SimdPerSH. + uint getSimdPerSH() const {return simdPerSH_;} + private: - Kernel *owner_; // The kernel which owns this object + device::Kernel *owner_; // The kernel which owns this object + uint simdPerSH_; // Simd Per SH std::unordered_map limiters_; // Maps virtual device to wave limiter bool enable_; // Whether the adaptation is enabled