From 6cc75de90f551df25b5752b1e0d75006583df026 Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 29 Aug 2018 18:54:19 -0400 Subject: [PATCH] P4 to Git Change 1599699 by gandryey@gera-w8 on 2018/08/29 18:43:02 SWDEV-79445 - OCL generic changes and code clean-up - Move WaveLimiter logic to the abstract layer. PAL version was taken as the base, thus performance of GSL path can be affected by this change Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#315 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.cpp#1 move/add ... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.hpp#1 move/add ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#598 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#331 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#133 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#15 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#11 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#107 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#64 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#8 move/delete ... 
//depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#8 move/delete --- rocclr/runtime/device/device.hpp | 1 + rocclr/runtime/device/devkernel.cpp | 38 ++- rocclr/runtime/device/devkernel.hpp | 53 +--- .../palwavelimiter.cpp => devwavelimiter.cpp} | 34 ++- .../palwavelimiter.hpp => devwavelimiter.hpp} | 23 +- rocclr/runtime/device/gpu/gpudevice.cpp | 1 + rocclr/runtime/device/gpu/gpukernel.cpp | 8 +- rocclr/runtime/device/gpu/gpukernel.hpp | 21 +- rocclr/runtime/device/gpu/gpuwavelimiter.cpp | 276 ------------------ rocclr/runtime/device/gpu/gpuwavelimiter.hpp | 151 ---------- rocclr/runtime/device/pal/paldevice.cpp | 1 + rocclr/runtime/device/pal/palkernel.cpp | 7 +- rocclr/runtime/device/pal/palkernel.hpp | 14 +- 13 files changed, 95 insertions(+), 533 deletions(-) rename rocclr/runtime/device/{pal/palwavelimiter.cpp => devwavelimiter.cpp} (80%) rename rocclr/runtime/device/{pal/palwavelimiter.hpp => devwavelimiter.hpp} (90%) delete mode 100644 rocclr/runtime/device/gpu/gpuwavelimiter.cpp delete mode 100644 rocclr/runtime/device/gpu/gpuwavelimiter.hpp diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp index ec89e63f5b..13b7ac451b 100644 --- a/rocclr/runtime/device/device.hpp +++ b/rocclr/runtime/device/device.hpp @@ -425,6 +425,7 @@ struct Info : public amd::EmbeddedObject { //! that execute in parallel. All work items from the same work group must be //! executed by SIMDs in the same compute unit. cl_uint simdPerCU_; + cl_uint cuPerShaderArray_; //!< Number of CUs per shader array //! The maximum number of work items from the same work group that can be //! 
executed by a SIMD in parallel cl_uint simdWidth_; diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp index 9c4b43f960..12ec537d31 100644 --- a/rocclr/runtime/device/devkernel.cpp +++ b/rocclr/runtime/device/devkernel.cpp @@ -23,7 +23,43 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD; namespace device { - // ================================================================================================ +// ================================================================================================ +Kernel::Kernel(const amd::Device& dev, const std::string& name) + : dev_(dev) + , name_(name) + , signature_(nullptr) + , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) { + // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); + // Due to std::string not being able to be memset to 0 + workGroupInfo_.size_ = 0; + workGroupInfo_.compileSize_[0] = 0; + workGroupInfo_.compileSize_[1] = 0; + workGroupInfo_.compileSize_[2] = 0; + workGroupInfo_.localMemSize_ = 0; + workGroupInfo_.preferredSizeMultiple_ = 0; + workGroupInfo_.privateMemSize_ = 0; + workGroupInfo_.scratchRegs_ = 0; + workGroupInfo_.wavefrontPerSIMD_ = 0; + workGroupInfo_.wavefrontSize_ = 0; + workGroupInfo_.availableGPRs_ = 0; + workGroupInfo_.usedGPRs_ = 0; + workGroupInfo_.availableSGPRs_ = 0; + workGroupInfo_.usedSGPRs_ = 0; + workGroupInfo_.availableVGPRs_ = 0; + workGroupInfo_.usedVGPRs_ = 0; + workGroupInfo_.availableLDSSize_ = 0; + workGroupInfo_.usedLDSSize_ = 0; + workGroupInfo_.availableStackSize_ = 0; + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.compileSizeHint_[0] = 0; + workGroupInfo_.compileSizeHint_[1] = 0; + workGroupInfo_.compileSizeHint_[2] = 0; + workGroupInfo_.compileVecTypeHint_ = ""; + workGroupInfo_.uniformWorkGroupSize_ = false; + workGroupInfo_.wavesPerSimdHint_ = 0; +} + +// ================================================================================================ 
bool Kernel::createSignature( const parameters_t& params, uint32_t numParameters, uint32_t version) { diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp index 94200a92b0..59f7733fa9 100644 --- a/rocclr/runtime/device/devkernel.hpp +++ b/rocclr/runtime/device/devkernel.hpp @@ -7,6 +7,7 @@ #include "platform/context.hpp" #include "platform/object.hpp" #include "platform/memory.hpp" +#include "devwavelimiter.hpp" #if defined(WITH_LIGHTNING_COMPILER) namespace llvm { @@ -37,10 +38,6 @@ class Device; class KernelSignature; class NDRange; -struct ProfilingCallback : public amd::HeapObject { - virtual void callback(ulong duration, uint32_t waves) = 0; -}; - struct KernelParameterDescriptor { enum { Value = 0, @@ -124,39 +121,7 @@ class Kernel : public amd::HeapObject { }; //! Default constructor - Kernel(const amd::Device& dev, const std::string& name) - : dev_(dev) - , name_(name) - , signature_(nullptr) { - // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); - // Due to std::string not being able to be memset to 0 - workGroupInfo_.size_ = 0; - workGroupInfo_.compileSize_[0] = 0; - workGroupInfo_.compileSize_[1] = 0; - workGroupInfo_.compileSize_[2] = 0; - workGroupInfo_.localMemSize_ = 0; - workGroupInfo_.preferredSizeMultiple_ = 0; - workGroupInfo_.privateMemSize_ = 0; - workGroupInfo_.scratchRegs_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = 0; - workGroupInfo_.wavefrontSize_ = 0; - workGroupInfo_.availableGPRs_ = 0; - workGroupInfo_.usedGPRs_ = 0; - workGroupInfo_.availableSGPRs_ = 0; - workGroupInfo_.usedSGPRs_ = 0; - workGroupInfo_.availableVGPRs_ = 0; - workGroupInfo_.usedVGPRs_ = 0; - workGroupInfo_.availableLDSSize_ = 0; - workGroupInfo_.usedLDSSize_ = 0; - workGroupInfo_.availableStackSize_ = 0; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.compileSizeHint_[0] = 0; - workGroupInfo_.compileSizeHint_[1] = 0; - workGroupInfo_.compileSizeHint_[2] = 0; - workGroupInfo_.compileVecTypeHint_ = ""; - 
workGroupInfo_.uniformWorkGroupSize_ = false; - workGroupInfo_.wavesPerSimdHint_ = 0; - } + Kernel(const amd::Device& dev, const std::string& name); //! Default destructor virtual ~Kernel(); @@ -196,13 +161,14 @@ class Kernel : public amd::HeapObject { size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) { - return nullptr; - } + amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { + return waveLimiter_.getProfilingCallback(vdev); + }; - virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const { - return 0; - } + //! Get waves per shader array to be used for kernel execution. + uint getWavesPerSH(const device::VirtualDevice* vdev) const { + return waveLimiter_.getWavesPerSH(vdev); + }; //! Returns GPU device object, associated with this kernel const amd::Device& dev() const { return dev_; } @@ -272,6 +238,7 @@ class Kernel : public amd::HeapObject { amd::KernelSignature* signature_; //!< kernel signature std::string buildLog_; //!< build log std::vector printf_; //!< Format strings for GPU printf support + WaveLimiterManager waveLimiter_; //!< adaptively control number of waves union Flags { struct { diff --git a/rocclr/runtime/device/pal/palwavelimiter.cpp b/rocclr/runtime/device/devwavelimiter.cpp similarity index 80% rename from rocclr/runtime/device/pal/palwavelimiter.cpp rename to rocclr/runtime/device/devwavelimiter.cpp index 75cb0811cc..67d2380ffb 100644 --- a/rocclr/runtime/device/pal/palwavelimiter.cpp +++ b/rocclr/runtime/device/devwavelimiter.cpp @@ -1,20 +1,22 @@ // // Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
// -#include "device/pal/palkernel.hpp" -#include "device/pal/palwavelimiter.hpp" +#include "platform/command.hpp" +#include "device/devkernel.hpp" +#include "device/devwavelimiter.hpp" #include "os/os.hpp" #include "utils/flags.hpp" #include using namespace std; -namespace pal { +namespace device { uint WaveLimiter::MaxWave; uint WaveLimiter::RunCount; uint WaveLimiter::AdaptCount; +// ================================================================================================ WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); @@ -36,12 +38,14 @@ WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, numContinuousSamples_ = 0; } +// ================================================================================================ WaveLimiter::~WaveLimiter() { if (traceStream_.is_open()) { traceStream_.close(); } } +// ================================================================================================ uint WaveLimiter::getWavesPerSH() { // Generate different wave counts in the adaptation mode if ((state_ == ADAPT) && (sampleCount_ < AdaptCount)) { @@ -66,6 +70,7 @@ uint WaveLimiter::getWavesPerSH() { return waves_ * SIMDPerSH_; } +// ================================================================================================ WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) : WaveLimiter(manager, seqNum, enable, enableDump) { @@ -78,8 +83,10 @@ WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, b clearData(); } +// ================================================================================================ WLAlgorithmSmooth::~WLAlgorithmSmooth() {} +// 
================================================================================================ void WLAlgorithmSmooth::clearData() { waves_ = MaxWave; countAll_ = 0; @@ -88,10 +95,11 @@ void WLAlgorithmSmooth::clearData() { dataCount_ = 0; } +// ================================================================================================ void WLAlgorithmSmooth::updateData(ulong time) { - } +// ================================================================================================ void WLAlgorithmSmooth::outputTrace() { if (!traceStream_.is_open()) { return; @@ -114,7 +122,7 @@ void WLAlgorithmSmooth::outputTrace() { traceStream_ << "\n\n"; } - +// ================================================================================================ void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) { dumper_.addData(duration, waves, static_cast(state_)); @@ -212,6 +220,7 @@ void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) { } } +// ================================================================================================ WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) { enable_ = enable; if (enable_) { @@ -219,6 +228,7 @@ WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) } } +// ================================================================================================ WaveLimiter::DataDumper::~DataDumper() { if (!enable_) { return; @@ -232,6 +242,7 @@ WaveLimiter::DataDumper::~DataDumper() { OFS.close(); } +// ================================================================================================ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { if (!enable_) { return; @@ -242,18 +253,24 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { state_.push_back(state); } +// ================================================================================================ 
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH) : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) { - setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH); + setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, ((simdPerSH == 0) ? 1 : simdPerSH)); fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_; } +// ================================================================================================ WaveLimiterManager::~WaveLimiterManager() { for (auto& I : limiters_) { delete I.second; } } +// ================================================================================================ +const std::string& WaveLimiterManager::name() const { return owner_->name(); } + +// ================================================================================================ uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const { if (fixed_ > 0) { return fixed_; @@ -291,7 +308,8 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( return limiter; } -void WaveLimiterManager::enable() { +// ================================================================================================ +void WaveLimiterManager::enable(bool isSupported) { if (fixed_ > 0) { return; } @@ -300,7 +318,7 @@ void WaveLimiterManager::enable() { // Disabled for SI due to bug #10817 if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { enable_ = GPU_WAVE_LIMIT_ENABLE; - } else { + } else if (isSupported) { if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { enable_ = true; } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { diff --git a/rocclr/runtime/device/pal/palwavelimiter.hpp b/rocclr/runtime/device/devwavelimiter.hpp similarity index 90% rename from rocclr/runtime/device/pal/palwavelimiter.hpp rename to rocclr/runtime/device/devwavelimiter.hpp index 6caea9eb79..e13aaee8c0 100644 --- a/rocclr/runtime/device/pal/palwavelimiter.hpp +++ 
b/rocclr/runtime/device/devwavelimiter.hpp @@ -3,7 +3,6 @@ // #pragma once -#include "platform/command.hpp" #include "thread/thread.hpp" #include #include @@ -11,11 +10,17 @@ #include #include +namespace amd { + struct ProfilingCallback : public amd::HeapObject { + virtual void callback(ulong duration, uint32_t waves) = 0; + }; +} + //! \namespace pal PAL Device Implementation -namespace pal { +namespace device { class WaveLimiterManager; -class HSAILKernel; +class Kernel; // Adaptively limit the number of waves per SIMD based on kernel execution time class WaveLimiter : public amd::ProfilingCallback { @@ -120,20 +125,20 @@ class WLAlgorithmSmooth : public WaveLimiter { // Create wave limiter for each virtual device for a kernel and manages the wave limiters. class WaveLimiterManager { public: - explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); + explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH); virtual ~WaveLimiterManager(); //! Get waves per shader array for a specific virtual device. - uint getWavesPerSH(const device::VirtualDevice*) const; + uint getWavesPerSH(const VirtualDevice*) const; //! Provide call back function for a specific virtual device. - amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*); + amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*); //! Enable wave limiter manager by kernel metadata and flags. - void enable(); + void enable(bool isSupported = true); //! Returns the kernel name - const std::string& name() const { return owner_->name(); } + const std::string& name() const; //! Get SimdPerSH. 
uint getSimdPerSH() const { return simdPerSH_; } @@ -141,7 +146,7 @@ class WaveLimiterManager { private: device::Kernel* owner_; // The kernel which owns this object uint simdPerSH_; // Simd Per SH - std::unordered_map + std::unordered_map limiters_; // Maps virtual device to wave limiter bool enable_; // Whether the adaptation is enabled bool enableDump_; // Whether the data dumper is enabled diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index 2d95f4eae7..de937f6e5a 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -565,6 +565,7 @@ void NullDevice::fillDeviceInfo(const CALdeviceattribs& calAttr, const gslMemInf info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation & 0x07); info_.simdPerCU_ = hwInfo()->simdPerCU_; + info_.cuPerShaderArray_ = calAttr.numberOfCUsperShaderArray; info_.simdWidth_ = hwInfo()->simdWidth_; info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; info_.wavefrontWidth_ = calAttr.wavefrontSize; diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index fe88dbd499..68d7a777eb 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -810,9 +810,7 @@ bool Kernel::create(const std::string& code, const std::string& metadata, const Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog, const InitData* initData) - : NullKernel(name, gpuDev, prog), - waveLimiter_(this, - dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) { + : NullKernel(name, gpuDev, prog) { hwPrivateSize_ = 0; if (NULL != initData) { flags_ = initData->flags_; @@ -3054,9 +3052,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi code_(NULL), codeSize_(0), hwMetaData_(NULL), - extraArgumentsNum_(extraArgsNum), - waveLimiter_(this, (prog->isNull() ? 
1 : dev().getAttribs().numberOfCUsperShaderArray) * - dev().hwInfo()->simdPerCU_) { + extraArgumentsNum_(extraArgsNum) { flags_.hsa_ = true; } diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index 544cc9e9e7..a60ada3dad 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -15,7 +15,7 @@ #include "device/gpu/gpuvirtual.hpp" #include "amd_hsa_kernel_code.h" #include "device/gpu/gpuprintf.hpp" -#include "device/gpu/gpuwavelimiter.hpp" +#include "device/devwavelimiter.hpp" #include "hsa.h" namespace amd { @@ -608,11 +608,6 @@ class Kernel : public NullKernel { VirtualGPU::GslKernelDesc* desc //!< Kernel descriptor ) const; - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { - return waveLimiter_.getProfilingCallback(vdev); - } - protected: //! Initializes the kernel parameters for the abstraction layer bool initParameters(); @@ -707,8 +702,6 @@ class Kernel : public NullKernel { uint hwPrivateSize_; //!< initial HW private size uint hwLocalSize_; //!< initial HW local size - - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; enum HSAIL_ADDRESS_QUALIFIER { @@ -833,16 +826,6 @@ class HSAILKernel : public device::Kernel { //! Returns kernel's extra argument count uint extraArgumentsNum() const { return extraArgumentsNum_; } - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { - return waveLimiter_.getProfilingCallback(vdev); - } - - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(const device::VirtualDevice* vdev) const { - return waveLimiter_.getWavesPerSH(vdev); - } - private: //! Disable copy constructor HSAILKernel(const HSAILKernel&); @@ -873,8 +856,6 @@ class HSAILKernel : public device::Kernel { char* hwMetaData_; //!< SI metadata uint extraArgumentsNum_; //! 
Number of extra (hidden) kernel arguments - - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp deleted file mode 100644 index b110f41633..0000000000 --- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - -#include "device/gpu/gpukernel.hpp" -#include "device/gpu/gpuwavelimiter.hpp" -#include "os/os.hpp" -#include "utils/flags.hpp" - -#include -using namespace std; - -namespace gpu { - -uint WaveLimiter::MaxWave; -uint WaveLimiter::WarmUpCount; -uint WaveLimiter::RunCount; -uint WLAlgorithmSmooth::AdaptCount; -uint WLAlgorithmSmooth::AbandonThresh; -uint WLAlgorithmSmooth::DscThresh; - -WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) - : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { - setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); - MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; - WarmUpCount = GPU_WAVE_LIMIT_WARMUP; - RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; - - state_ = WARMUP; - if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { - traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt"); - } - - waves_ = MaxWave; - currWaves_ = MaxWave; - bestWave_ = MaxWave; - enable_ = enable; -} - -WaveLimiter::~WaveLimiter() { - if (traceStream_.is_open()) { - traceStream_.close(); - } -} - -uint WaveLimiter::getWavesPerSH() { - currWaves_ = waves_; - return waves_ * SIMDPerSH_; -} - -WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, - bool enableDump) - : WaveLimiter(manager, seqNum, enable, enableDump) { - AdaptCount = 2 * MaxWave + 1; - AbandonThresh = GPU_WAVE_LIMIT_ABANDON; - DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; - - dynRunCount_ = 
RunCount; - measure_.resize(MaxWave + 1); - reference_.resize(MaxWave + 1); - trial_.resize(MaxWave + 1); - ratio_.resize(MaxWave + 1); - - clearData(); -} - -WLAlgorithmSmooth::~WLAlgorithmSmooth() {} - -void WLAlgorithmSmooth::clearData() { - waves_ = MaxWave; - countAll_ = 0; - clear(measure_); - clear(reference_); - clear(trial_); - clear(ratio_); - discontinuous_ = false; - dataCount_ = 0; -} - -void WLAlgorithmSmooth::updateData(ulong time) { - auto count = dataCount_ - 1; - assert(count < 2 * MaxWave + 1); - assert(time > 0); - assert(currWaves_ == waves_); - if (count % 2 == 0) { - assert(waves_ == MaxWave); - auto pos = count / 2; - measure_[pos] = time; - if (pos > 0) { - auto wave = MaxWave + 1 - pos; - if (abs(static_cast(measure_[pos - 1]) - static_cast(measure_[pos])) * 100 / - measure_[pos] > - DscThresh) { - discontinuous_ = true; - } - reference_[wave] = (time + measure_[pos - 1]) / 2; - ratio_[wave] = trial_[wave] * 100 / reference_[wave]; - if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) { - bestWave_ = wave; - } - } - } else { - assert(waves_ == MaxWave - count / 2); - trial_[waves_] = time; - } - outputTrace(); -} - -void WLAlgorithmSmooth::outputTrace() { - if (!traceStream_.is_open()) { - return; - } - - traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ - << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ - << '\n'; - output(traceStream_, "\n measure = ", measure_); - output(traceStream_, "\n reference = ", reference_); - output(traceStream_, "\n ratio = ", ratio_); - traceStream_ << "\n\n"; -} - - -void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) { - dumper_.addData(duration, currWaves_, static_cast(state_)); - - if (!enable_ || (duration == 0)) { - return; - } - - countAll_++; - - switch (state_) { - case WARMUP: - if (countAll_ < WarmUpCount) { - return; - } - state_ = ADAPT; - bestWave_ = MaxWave; - clearData(); - return; - case ADAPT: - assert(duration > 
0); - if (waves_ == currWaves_) { - dataCount_++; - updateData(duration); - waves_ = MaxWave + 1 - dataCount_ / 2; - if (dataCount_ == 1 || (dataCount_ < AdaptCount && !discontinuous_ && - (dataCount_ % 2 == 0 || ratio_[waves_] < AbandonThresh))) { - if (dataCount_ % 2 == 1) { - --waves_; - } else { - waves_ = MaxWave; - } - return; - } - waves_ = bestWave_; - if (dataCount_ >= AdaptCount) { - dynRunCount_ = RunCount; - } else { - dynRunCount_ = AdaptCount; - } - countAll_ = rand() % MaxWave; - state_ = RUN; - } - return; - case RUN: - if (countAll_ < dynRunCount_) { - return; - } - state_ = ADAPT; - bestWave_ = MaxWave; - clearData(); - return; - } -} - -WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) { - enable_ = enable; - if (enable_) { - fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; - } -} - -WaveLimiter::DataDumper::~DataDumper() { - if (!enable_) { - return; - } - - std::ofstream OFS(fileName_); - for (size_t i = 0, e = time_.size(); i != e; ++i) { - OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast(state_[i]) - << '\n'; - } - OFS.close(); -} - -void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { - if (!enable_) { - return; - } - - time_.push_back(time); - wavePerSIMD_.push_back(wave); - state_.push_back(state); -} - -WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH) - : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) { - setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH); - fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_; -} - -WaveLimiterManager::~WaveLimiterManager() { - for (auto& I : limiters_) { - delete I.second; - } -} - -uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const { - if (fixed_ > 0) { - return fixed_; - } - if (!enable_) { - return 0; - } - auto loc = limiters_.find(vdev); - if (loc == limiters_.end()) { - return 0; - } - 
assert(loc->second != NULL); - return loc->second->getWavesPerSH(); -} - -amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( - const device::VirtualDevice* vdev) { - assert(vdev != NULL); - if (!enable_ && !enableDump_) { - return NULL; - } - - amd::ScopedLock SL(monitor_); - auto loc = limiters_.find(vdev); - if (loc != limiters_.end()) { - return loc->second; - } - - auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_); - if (limiter == NULL) { - enable_ = false; - return NULL; - } - limiters_[vdev] = limiter; - return limiter; -} - -void WaveLimiterManager::enable(const bool isCiPlus) { - if (fixed_ > 0) { - return; - } - - // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 - // Disabled for SI due to bug #10817 - if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { - enable_ = GPU_WAVE_LIMIT_ENABLE; - } else { - if (isCiPlus) { - if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { - enable_ = true; - } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { - fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); - } - } - } -} -} diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp deleted file mode 100644 index 570a457d62..0000000000 --- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp +++ /dev/null @@ -1,151 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef GPUWAVELIMITER_HPP_ -#define GPUWAVELIMITER_HPP_ - -#include "platform/command.hpp" -#include "thread/thread.hpp" -#include -#include -#include -#include -#include - -//! 
\namespace gpu GPU Device Implementation -namespace gpu { - -class WaveLimiterManager; - -// Adaptively limit the number of waves per SIMD based on kernel execution time -class WaveLimiter : public amd::ProfilingCallback { - public: - explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WaveLimiter(); - - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(); - - protected: - enum StateKind { WARMUP, ADAPT, RUN }; - - class DataDumper { - public: - explicit DataDumper(const std::string& kernelName, bool enable); - ~DataDumper(); - - //! Record execution time, waves/simd and state of wave limiter. - void addData(ulong time, uint wave, char state); - - //! Whether this data dumper is enabled. - bool enabled() const { return enable_; } - - private: - bool enable_; - std::string fileName_; - std::vector time_; - std::vector wavePerSIMD_; - std::vector state_; - }; - - std::vector measure_; - bool enable_; - uint SIMDPerSH_; // Number of SIMDs per SH - uint waves_; // Waves per SIMD to be set - uint bestWave_; // Optimal waves per SIMD - uint countAll_; // Number of kernel executions - StateKind state_; - WaveLimiterManager* manager_; - DataDumper dumper_; - std::ofstream traceStream_; - uint currWaves_; // Current waves per SIMD - - static uint MaxWave; // Maximum number of waves per SIMD - static uint WarmUpCount; // Number of kernel executions for warm up - static uint RunCount; // Number of kernel executions for normal run - - //! Call back from Event::recordProfilingInfo to get execution time. - virtual void callback(ulong duration, uint32_t waves) = 0; - - //! Output trace of measurement/adaptation. 
- virtual void outputTrace() = 0; - - template void clear(T& A) { - for (auto& I : A) { - I = 0; - } - } - template void output(std::ofstream& ofs, const std::string& prompt, T& A) { - ofs << prompt; - for (auto& I : A) { - ofs << ' ' << static_cast(I); - } - } -}; - -class WLAlgorithmSmooth : public WaveLimiter { - public: - explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, - bool enableDump); - virtual ~WLAlgorithmSmooth(); - - private: - std::vector reference_; - std::vector trial_; - std::vector ratio_; - bool discontinuous_; // Measured data is discontinuous - uint dynRunCount_; - uint dataCount_; - - static uint AdaptCount; // Number of kernel executions for adapting - static uint AbandonThresh; // Threshold to abandon adaptation - static uint DscThresh; // Threshold for identifying discontinuities - - //! Update measurement data and optimal waves/simd with execution time. - void updateData(ulong time); - - //! Clear measurement data for the next adaptation. - void clearData(); - - //! Call back from Event::recordProfilingInfo to get execution time. - void callback(ulong duration, uint32_t waves); - - //! Output trace of measurement/adaptation. - void outputTrace(); -}; - -// Create wave limiter for each virtual device for a kernel and manages the wave limiters. -class WaveLimiterManager { - public: - explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); - virtual ~WaveLimiterManager(); - - //! Get waves per shader array for a specific virtual device. - uint getWavesPerSH(const device::VirtualDevice*) const; - - //! Provide call back function for a specific virtual device. - amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*); - - //! Enable wave limiter manager by kernel metadata and flags. - void enable(const bool isCiPlus); - - //! Returns the kernel name - const std::string& name() const { return owner_->name(); } - - //! Get SimdPerSH. 
- uint getSimdPerSH() const { return simdPerSH_; } - - private: - device::Kernel* owner_; // The kernel which owns this object - uint simdPerSH_; // Simd Per SH - std::unordered_map - limiters_; // Maps virtual device to wave limiter - bool enable_; // Whether the adaptation is enabled - bool enableDump_; // Whether the data dumper is enabled - uint fixed_; // The fixed waves/simd value if not zero - amd::Monitor monitor_; // The mutex for updating the wave limiter map -}; -} -#endif diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index 5d0a0bcd03..aaa4987605 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -574,6 +574,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber; info_.simdPerCU_ = hwInfo()->simdPerCU_; + info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray; info_.simdWidth_ = hwInfo()->simdWidth_; info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize; diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index 7c330fbbd4..fbfe429231 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -74,12 +74,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi prog_(*prog), index_(0), code_(0), - codeSize_(0), - waveLimiter_( - this, - (prog->isNull() ? 
1 - : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * - dev().hwInfo()->simdPerCU_) { + codeSize_(0) { flags_.hsa_ = true; } diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp index 43fc6ff185..c7477697c0 100644 --- a/rocclr/runtime/device/pal/palkernel.hpp +++ b/rocclr/runtime/device/pal/palkernel.hpp @@ -13,7 +13,7 @@ #include "device/pal/palvirtual.hpp" #include "amd_hsa_kernel_code.h" #include "device/pal/palprintf.hpp" -#include "device/pal/palwavelimiter.hpp" +#include "device/devwavelimiter.hpp" #include "hsa.h" #if defined(WITH_LIGHTNING_COMPILER) @@ -98,16 +98,6 @@ class HSAILKernel : public device::Kernel { //! Returns the kernel index in the program uint index() const { return index_; } - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { - return waveLimiter_.getProfilingCallback(vdev); - }; - - //! Get waves per shader array to be used for kernel execution. - virtual uint getWavesPerSH(const device::VirtualDevice* vdev) const { - return waveLimiter_.getWavesPerSH(vdev); - }; - private: //! Disable copy constructor HSAILKernel(const HSAILKernel&); @@ -126,8 +116,6 @@ class HSAILKernel : public device::Kernel { uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code - - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; #if defined(WITH_LIGHTNING_COMPILER)