From 5d9912f48bbe1a0c8831967c2d69148149186d76 Mon Sep 17 00:00:00 2001 From: German Date: Thu, 24 Aug 2023 18:29:40 -0400 Subject: [PATCH] SWDEV-407533 - [ABI Break]Remove Wavelimiter Change-Id: I6a2f6fb5a0c3acea93fa0200a69679783e76f5bd [ROCm/clr commit: 7be3a5e33e4824e3713c5cdaa4db5c4c6577c691] --- projects/clr/hipamd/src/hip_event.hpp | 1 - projects/clr/rocclr/cmake/ROCclr.cmake | 1 - projects/clr/rocclr/device/devkernel.cpp | 3 +- projects/clr/rocclr/device/devkernel.hpp | 12 - projects/clr/rocclr/device/devprogram.hpp | 1 - projects/clr/rocclr/device/devwavelimiter.cpp | 348 ------------------ projects/clr/rocclr/device/devwavelimiter.hpp | 173 --------- projects/clr/rocclr/device/pal/palkernel.cpp | 2 - projects/clr/rocclr/device/pal/palkernel.hpp | 1 - projects/clr/rocclr/device/pal/palvirtual.cpp | 13 +- projects/clr/rocclr/device/pal/palvirtual.hpp | 1 - projects/clr/rocclr/platform/command.cpp | 7 - projects/clr/rocclr/platform/command.hpp | 19 +- projects/clr/rocclr/utils/flags.hpp | 12 - 14 files changed, 10 insertions(+), 584 deletions(-) delete mode 100644 projects/clr/rocclr/device/devwavelimiter.cpp delete mode 100644 projects/clr/rocclr/device/devwavelimiter.hpp diff --git a/projects/clr/hipamd/src/hip_event.hpp b/projects/clr/hipamd/src/hip_event.hpp index df32e37262..1927242b0c 100644 --- a/projects/clr/hipamd/src/hip_event.hpp +++ b/projects/clr/hipamd/src/hip_event.hpp @@ -82,7 +82,6 @@ class EventMarker : public amd::Marker { int32_t scope = amd::Device::kCacheStateInvalid) : amd::Marker(stream, disableFlush) { profilingInfo_.enabled_ = true; - profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = markerTs; profilingInfo_.clear(); setEventScope(scope); diff --git a/projects/clr/rocclr/cmake/ROCclr.cmake b/projects/clr/rocclr/cmake/ROCclr.cmake index 10fa979735..f89ec675f6 100644 --- a/projects/clr/rocclr/cmake/ROCclr.cmake +++ b/projects/clr/rocclr/cmake/ROCclr.cmake @@ -70,7 +70,6 @@ target_sources(rocclr PRIVATE 
${ROCCLR_SRC_DIR}/device/device.cpp ${ROCCLR_SRC_DIR}/device/devkernel.cpp ${ROCCLR_SRC_DIR}/device/devprogram.cpp - ${ROCCLR_SRC_DIR}/device/devwavelimiter.cpp ${ROCCLR_SRC_DIR}/device/hsailctx.cpp ${ROCCLR_SRC_DIR}/elf/elf.cpp ${ROCCLR_SRC_DIR}/os/alloc.cpp diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp index db2bdfb81c..5747e08258 100644 --- a/projects/clr/rocclr/device/devkernel.cpp +++ b/projects/clr/rocclr/device/devkernel.cpp @@ -589,8 +589,7 @@ Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& p : dev_(dev) , name_(name) , prog_(prog) - , signature_(nullptr) - , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) { + , signature_(nullptr) { // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); // Due to std::string not being able to be memset to 0 workGroupInfo_.size_ = 0; diff --git a/projects/clr/rocclr/device/devkernel.hpp b/projects/clr/rocclr/device/devkernel.hpp index ced898f598..dbdf1e34ff 100644 --- a/projects/clr/rocclr/device/devkernel.hpp +++ b/projects/clr/rocclr/device/devkernel.hpp @@ -26,7 +26,6 @@ #include "platform/context.hpp" #include "platform/object.hpp" #include "platform/memory.hpp" -#include "devwavelimiter.hpp" namespace amd { class Device; @@ -435,16 +434,6 @@ class Kernel : public amd::HeapObject { size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } - //! Get profiling callback object - amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { - return waveLimiter_.getProfilingCallback(vdev); - }; - - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(const device::VirtualDevice* vdev) const { - return waveLimiter_.getWavesPerSH(vdev); - }; - //! 
Returns GPU device object, associated with this kernel const amd::Device& device() const { return dev_; } @@ -567,7 +556,6 @@ class Kernel : public amd::HeapObject { amd::KernelSignature* signature_; //!< kernel signature std::string buildLog_; //!< build log std::vector printf_; //!< Format strings for GPU printf support - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves std::string runtimeHandle_; //!< Runtime handle for context loader uint64_t kernelCodeHandle_ = 0; //!< Kernel code handle (aka amd_kernel_code_t) diff --git a/projects/clr/rocclr/device/devprogram.hpp b/projects/clr/rocclr/device/devprogram.hpp index 55eac9984e..78554f0c64 100644 --- a/projects/clr/rocclr/device/devprogram.hpp +++ b/projects/clr/rocclr/device/devprogram.hpp @@ -26,7 +26,6 @@ #include "platform/context.hpp" #include "platform/object.hpp" #include "platform/memory.hpp" -#include "devwavelimiter.hpp" #if defined(USE_COMGR_LIBRARY) #include "amd_comgr/amd_comgr.h" diff --git a/projects/clr/rocclr/device/devwavelimiter.cpp b/projects/clr/rocclr/device/devwavelimiter.cpp deleted file mode 100644 index deaadd98e2..0000000000 --- a/projects/clr/rocclr/device/devwavelimiter.cpp +++ /dev/null @@ -1,348 +0,0 @@ -/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#include "platform/command.hpp" -#include "device/devkernel.hpp" -#include "device/devwavelimiter.hpp" -#include "os/os.hpp" -#include "utils/flags.hpp" - -#include -using namespace std; - -namespace device { - -uint WaveLimiter::MaxWave; -uint WaveLimiter::RunCount; -uint WaveLimiter::AdaptCount; - -// ================================================================================================ -WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) - : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { - setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); - MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; - RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; - AdaptCount = MaxContinuousSamples * 2 * (MaxWave + 1); - - state_ = WARMUP; - if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { - traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt"); - } - - waves_ = MaxWave; - enable_ = (SIMDPerSH_ == 0) ? false : enable; - bestWave_ = (enable_) ? 
MaxWave : 0; - worstWave_ = 0; - sampleCount_ = 0; - resultCount_ = 0; - numContinuousSamples_ = 0; -} - -// ================================================================================================ -WaveLimiter::~WaveLimiter() { - if (traceStream_.is_open()) { - traceStream_.close(); - } -} - -// ================================================================================================ -uint WaveLimiter::getWavesPerSH() { - // Generate different wave counts in the adaptation mode - if ((state_ == ADAPT) && (sampleCount_ < AdaptCount)) { - if (numContinuousSamples_ == 0) { - ++waves_; - waves_ %= MaxWave + 1; - // Don't execute the wave count with the worst performance - if (waves_ != 0) { - while (worstWave_ >= waves_) { - ++waves_; - waves_ %= MaxWave + 1; - } - } - } - ++numContinuousSamples_; - numContinuousSamples_ %= MaxContinuousSamples; - ++sampleCount_; - } - else { - waves_ = bestWave_; - } - return waves_ * SIMDPerSH_; -} - -// ================================================================================================ -WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, - bool enableDump) - : WaveLimiter(manager, seqNum, enable, enableDump) { - dynRunCount_ = RunCount; - adpMeasure_.resize(MaxWave + 1); - adpSampleCnt_.resize(MaxWave + 1); - runMeasure_.resize(MaxWave + 1); - runSampleCnt_.resize(MaxWave + 1); - - clearData(); -} - -// ================================================================================================ -WLAlgorithmSmooth::~WLAlgorithmSmooth() {} - -// ================================================================================================ -void WLAlgorithmSmooth::clearData() { - waves_ = MaxWave; - countAll_ = 0; - clear(adpMeasure_); - clear(adpSampleCnt_); - dataCount_ = 0; -} - -// ================================================================================================ -void WLAlgorithmSmooth::updateData(ulong time) { -} - -// 
================================================================================================ -void WLAlgorithmSmooth::outputTrace() { - if (!traceStream_.is_open()) { - return; - } - - traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ << - " waves=" << waves_ << " bestWave=" << bestWave_ << " worstWave=" << worstWave_ << '\n'; - output(traceStream_, "\n adaptive measure = ", adpMeasure_); - output(traceStream_, "\n adaptive sample count = ", adpSampleCnt_); - output(traceStream_, "\n run measure = ", runMeasure_); - output(traceStream_, "\n run sample count = ", runSampleCnt_); - traceStream_ << "\n % time from the previous runs to the best wave: "; - float min = static_cast(adpMeasure_[bestWave_]) / adpSampleCnt_[bestWave_]; - for (uint i = 0; i < (MaxWave + 1); ++i) { - runSampleCnt_[i] = (runSampleCnt_[i] == 0) ? 1 : runSampleCnt_[i]; - float average = static_cast(runMeasure_[i]) / runSampleCnt_[i]; - traceStream_ << (average * 100 / min) << " "; - } - traceStream_ << "\n run count = " << dynRunCount_; - traceStream_ << "\n\n"; -} - -// ================================================================================================ -void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) { - dumper_.addData(duration, waves, static_cast(state_)); - - if (!enable_ || (duration == 0)) { - return; - } - - countAll_++; - - waves /= SIMDPerSH_; - // Collect the time for the current wave count - runMeasure_[waves] += duration; - runSampleCnt_[waves]++; - - switch (state_) { - case ADAPT: - assert(duration > 0); - // Wave count 0 indicates the satrt of adaptation - if ((waves == 0) || (resultCount_ > 0)) { - // Scale time to us - adpMeasure_[waves] += duration; - adpSampleCnt_[waves]++; - resultCount_++; - // If the end of adaptation is reached, then analyze the results - if (resultCount_ == AdaptCount) { - // Reset the counters - resultCount_ = sampleCount_ = 0; - float min = std::numeric_limits::max(); - float max = 
std::numeric_limits::min(); - uint32_t best = bestWave_; - // Check performance for the previous run if it's available - if (runSampleCnt_[bestWave_] > 0) { - min = static_cast(runMeasure_[bestWave_]) / runSampleCnt_[bestWave_]; - } - else if (adpSampleCnt_[MaxWave] > 0) { - min = static_cast(adpMeasure_[MaxWave]) / adpSampleCnt_[MaxWave]; - bestWave_ = MaxWave; - } - // Find the fastest average time - float reference = min; - for (uint i = MaxWave; i > 0; --i) { - float average; - if (adpSampleCnt_[i] > 0) { - average = static_cast(adpMeasure_[i]) / adpSampleCnt_[i]; - } - else { - average = 0.0f; - } - // More waves have 5% advantage over the lower number - if (average * 1.05f < min) { - min = average; - bestWave_ = i; - } - if (average > max) { - max = average; - worstWave_ = i; - } - } - // Check for 5% acceptance - if ((min * 1.05f > reference) || (bestWave_ == best)) { - bestWave_ = best; - // Increase the run time if the same wave count is the best - dynRunCount_ += RunCount; - dynRunCount_++; - } - else { - dynRunCount_ = RunCount; - } - // Find the middle between the best and the worst - if (worstWave_ < bestWave_) { - worstWave_ += ((bestWave_ - worstWave_) >> 1); - } else { - worstWave_ = 0; - } - state_ = RUN; - outputTrace(); - // Start to collect the new data for the best wave - countAll_ = 0; - runMeasure_[bestWave_] = 0; - runSampleCnt_[bestWave_] = 0; - } - } - return; - case WARMUP: - case RUN: - if (countAll_ < dynRunCount_) { - return; - } - if (state_ == WARMUP) { - runSampleCnt_[bestWave_] = 0; - } - state_ = ADAPT; - clearData(); - return; - } -} - -// ================================================================================================ -WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) { - enable_ = enable; - if (enable_) { - fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; - } -} - -// ================================================================================================ 
-WaveLimiter::DataDumper::~DataDumper() { - if (!enable_) { - return; - } - - std::ofstream OFS(fileName_); - for (size_t i = 0, e = time_.size(); i != e; ++i) { - OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast(state_[i]) - << '\n'; - } - OFS.close(); -} - -// ================================================================================================ -void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { - if (!enable_) { - return; - } - - time_.push_back(time); - wavePerSIMD_.push_back(wave); - state_.push_back(state); -} - -// ================================================================================================ -WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH) - : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) { - setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, ((simdPerSH == 0) ? 1 : simdPerSH)); - fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_; -} - -// ================================================================================================ -WaveLimiterManager::~WaveLimiterManager() { - for (auto& I : limiters_) { - delete I.second; - } -} - -// ================================================================================================ -const std::string& WaveLimiterManager::name() const { return owner_->name(); } - -// ================================================================================================ -uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const { - if (fixed_ > 0) { - return fixed_; - } - if (!enable_) { - return 0; - } - auto loc = limiters_.find(vdev); - if (loc == limiters_.end()) { - return 0; - } - assert(loc->second != nullptr); - return loc->second->getWavesPerSH(); -} - -amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( - const device::VirtualDevice* vdev) { - assert(vdev != nullptr); - if (!enable_ && !enableDump_) { - return nullptr; - } - 
- amd::ScopedLock SL(monitor_); - auto loc = limiters_.find(vdev); - if (loc != limiters_.end()) { - return loc->second; - } - - auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_); - if (limiter == nullptr) { - enable_ = false; - return nullptr; - } - limiters_[vdev] = limiter; - return limiter; -} - -// ================================================================================================ -void WaveLimiterManager::enable(bool isSupported) { - if (fixed_ > 0) { - enable_ = GPU_WAVE_LIMIT_ENABLE; - return; - } - - // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 - // Disabled for SI due to bug #10817 - if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { - enable_ = GPU_WAVE_LIMIT_ENABLE; - } else if (isSupported) { - if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { - enable_ = true; - } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { - fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); - } - } -} - -} // namespace pal diff --git a/projects/clr/rocclr/device/devwavelimiter.hpp b/projects/clr/rocclr/device/devwavelimiter.hpp deleted file mode 100644 index cba01bb2db..0000000000 --- a/projects/clr/rocclr/device/devwavelimiter.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#pragma once - -#include "thread/thread.hpp" -#include -#include -#include -#include -#include - -namespace amd { - struct ProfilingCallback : public amd::HeapObject { - virtual void callback(ulong duration, uint32_t waves) = 0; - }; -} - -//! \namespace pal PAL Device Implementation -namespace device { - -class WaveLimiterManager; -class Kernel; - -// Adaptively limit the number of waves per SIMD based on kernel execution time -class WaveLimiter : public amd::ProfilingCallback { - public: - explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WaveLimiter(); - - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(); - - protected: - enum StateKind { WARMUP, ADAPT, RUN }; - - class DataDumper { - public: - explicit DataDumper(const std::string& kernelName, bool enable); - ~DataDumper(); - - //! Record execution time, waves/simd and state of wave limiter. - void addData(ulong time, uint wave, char state); - - //! Whether this data dumper is enabled. 
- bool enabled() const { return enable_; } - - private: - bool enable_; - std::string fileName_; - std::vector time_; - std::vector wavePerSIMD_; - std::vector state_; - }; - - bool enable_; - uint SIMDPerSH_; // Number of SIMDs per SH - uint waves_; // Waves per SIMD to be set - uint bestWave_; // Optimal waves per SIMD - uint worstWave_; // Wave number with the worst performance - uint countAll_; // Number of kernel executions - StateKind state_; - WaveLimiterManager* manager_; - DataDumper dumper_; - std::ofstream traceStream_; - uint32_t sampleCount_; //!< The number of samples for adaptive mode - uint32_t resultCount_; //!< The number of results for adaptive mode - uint32_t numContinuousSamples_; //!< The number of samples with the same wave count - - static uint MaxWave; // Maximum number of waves per SIMD - static uint RunCount; // Number of kernel executions for normal run - static uint AdaptCount; // Number of kernel executions for adapting - static constexpr uint MaxContinuousSamples = 2; - - //! Call back from Event::recordProfilingInfo to get execution time. - virtual void callback(ulong duration, uint32_t waves) = 0; - - //! Output trace of measurement/adaptation. 
- virtual void outputTrace() = 0; - - template void clear(T& A) { - uint idx = 0; - for (auto& I : A) { - if (idx > worstWave_) { - I = 0; - } - ++idx; - } - } - template void output(std::ofstream& ofs, const std::string& prompt, T& A) { - ofs << prompt; - for (auto& I : A) { - ofs << ' ' << static_cast(I); - } - } -}; - -class WLAlgorithmSmooth : public WaveLimiter { - public: - explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, - bool enableDump); - virtual ~WLAlgorithmSmooth(); - - private: - std::vector adpMeasure_; //!< Accumulated performance in the adaptation mode - std::vector adpSampleCnt_; //!< The number of samples in the adaptation mode - std::vector runMeasure_; //!< Accumulated performance in the run mode - std::vector runSampleCnt_; //!< The number of samples in the run mode - uint dynRunCount_; - uint dataCount_; - - //! Update measurement data and optimal waves/simd with execution time. - void updateData(ulong time); - - //! Clear measurement data for the next adaptation. - void clearData(); - - //! Call back from Event::recordProfilingInfo to get execution time. - void callback(ulong duration, uint32_t waves) override; - - //! Output trace of measurement/adaptation. - void outputTrace() override; -}; - -// Create wave limiter for each virtual device for a kernel and manages the wave limiters. -class WaveLimiterManager { - public: - explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH); - virtual ~WaveLimiterManager(); - - //! Get waves per shader array for a specific virtual device. - uint getWavesPerSH(const VirtualDevice*) const; - - //! Provide call back function for a specific virtual device. - amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*); - - //! Enable wave limiter manager by kernel metadata and flags. - void enable(bool isSupported = true); - - //! Returns the kernel name - const std::string& name() const; - - //! Get SimdPerSH. 
- uint getSimdPerSH() const { return simdPerSH_; } - - private: - device::Kernel* owner_; // The kernel which owns this object - uint simdPerSH_; // Simd Per SH - std::unordered_map - limiters_; // Maps virtual device to wave limiter - bool enable_; // Whether the adaptation is enabled - bool enableDump_; // Whether the data dumper is enabled - uint fixed_; // The fixed waves/simd value if not zero - amd::Monitor monitor_; // The mutex for updating the wave limiter map -}; -} diff --git a/projects/clr/rocclr/device/pal/palkernel.cpp b/projects/clr/rocclr/device/pal/palkernel.cpp index 1c0bd5130c..fe5672e2fc 100644 --- a/projects/clr/rocclr/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/device/pal/palkernel.cpp @@ -224,8 +224,6 @@ bool HSAILKernel::init() { return false; } - waveLimiter_.enable(); - size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); error = amd::Hsail::QueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, openClKernelName.c_str(), workGroupInfo_.compileSizeHint_, diff --git a/projects/clr/rocclr/device/pal/palkernel.hpp b/projects/clr/rocclr/device/pal/palkernel.hpp index 2956e7d5ff..7528713c27 100644 --- a/projects/clr/rocclr/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/device/pal/palkernel.hpp @@ -30,7 +30,6 @@ #include "device/pal/palvirtual.hpp" #include "amd_hsa_kernel_code.h" #include "device/pal/palprintf.hpp" -#include "device/devwavelimiter.hpp" #include "hsa.h" namespace amd { diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 4c82dfdf9c..7a095f0117 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -2473,8 +2473,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // Submit kernel to HW if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, - &vcmd.event(), vcmd.sharedMemBytes(), - 
vcmd.cooperativeGroups())) { + vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) { vcmd.setStatus(CL_INVALID_OPERATION); } @@ -2489,7 +2488,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { profilingBegin(vcmd); // Submit kernel to HW - if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) { vcmd.setStatus(CL_INVALID_OPERATION); } @@ -2499,9 +2498,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { } // ================================================================================================ -bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, - const_address parameters, bool nativeMem, - amd::Event* enqueueEvent, uint32_t sharedMemBytes, +bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, + const amd::Kernel& kernel, const_address parameters, + bool nativeMem, uint32_t sharedMemBytes, bool cooperativeGroup) { size_t newOffset[3] = {0, 0, 0}; size_t newGlobalSize[3] = {0, 0, 0}; @@ -2648,7 +2647,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); - dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0; + dispatchParam.wavesPerSh = 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? 
true : false; dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); #ifdef PAL_DEBUGGER diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 2595342c74..08bfa0e69a 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -311,7 +311,6 @@ class VirtualGPU : public device::VirtualDevice { const amd::Kernel& kernel, //!< Kernel for execution const_address parameters, //!< Parameters for the kernel bool nativeMem = true, //!< Native memory objects - amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command uint32_t sharedMemBytes = 0, //!< Shared memory size bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required ); diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index 0371d8dc57..d67f951e29 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -101,10 +101,6 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) { break; default: profilingInfo_.end_ = timeStamp; - if (profilingInfo_.callback_ != nullptr) { - profilingInfo_.callback_->callback(timeStamp - profilingInfo_.start_, - profilingInfo_.waves_); - } break; } return timeStamp; @@ -429,15 +425,12 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList firstDevice_(firstDevice) { auto& device = queue.device(); auto devKernel = const_cast(kernel.getDeviceKernel(device)); - profilingInfo_.setCallback(devKernel->getProfilingCallback( - queue.vdev()), devKernel->getWavesPerSH(queue.vdev())); if (cooperativeGroups()) { setNumWorkgroups(); } if (forceProfiling) { profilingInfo_.enabled_ = true; profilingInfo_.clear(); - profilingInfo_.callback_ = nullptr; profilingInfo_.marker_ts_ = true; } kernel_.retain(); diff --git a/projects/clr/rocclr/platform/command.hpp 
b/projects/clr/rocclr/platform/command.hpp index b45c9067ae..ddb9a9f170 100644 --- a/projects/clr/rocclr/platform/command.hpp +++ b/projects/clr/rocclr/platform/command.hpp @@ -104,10 +104,9 @@ class Event : public RuntimeObject { static const EventWaitList nullWaitList; struct ProfilingInfo { - ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) { + ProfilingInfo(bool enabled = false) : enabled_(enabled), marker_ts_(false) { if (enabled) { clear(); - callback_ = nullptr; correlation_id_ = activity_prof::correlation_id; } } @@ -116,11 +115,9 @@ class Event : public RuntimeObject { uint64_t submitted_; uint64_t start_; uint64_t end_; - bool enabled_; //!< Profiling enabled for the wave limiter - uint32_t waves_; //!< The number of waves used in a dispatch - ProfilingCallback* callback_; uint64_t correlation_id_; - bool marker_ts_; //!< TS marker + bool enabled_; //!< Profiling enabled for this event + bool marker_ts_; //!< TS marker void clear() { queued_ = 0ULL; @@ -128,15 +125,6 @@ class Event : public RuntimeObject { start_ = 0ULL; end_ = 0ULL; } - void setCallback(ProfilingCallback* callback, uint32_t waves) { - if (callback == NULL) { - return; - } - enabled_ = true; - waves_ = waves; - clear(); - callback_ = callback; - } } profilingInfo_; //! Construct a new event. 
@@ -163,7 +151,6 @@ class Event : public RuntimeObject { void EnableProfiling() { profilingInfo_.enabled_ = true; profilingInfo_.clear(); - profilingInfo_.callback_ = nullptr; profilingInfo_.correlation_id_ = activity_prof::correlation_id; } diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index 777e7754dd..cf6917541d 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -118,8 +118,6 @@ release(uint, OCL_SET_SVM_SIZE, 4*16384, \ "set SVM space size for discrete GPU") \ release(uint, GPU_WAVES_PER_SIMD, 0, \ "Force the number of waves per SIMD (1-10)") \ -release(bool, GPU_WAVE_LIMIT_ENABLE, false, \ - "1 = Enable adaptive wave limiter") \ release(bool, OCL_STUB_PROGRAMS, false, \ "1 = Enables OCL programs stubing") \ release(bool, GPU_ANALYZE_HANG, false, \ @@ -128,16 +126,6 @@ release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2, \ "Maximum size (in Ki) that allows device memory substitution with system") \ release(bool, GPU_ADD_HBCC_SIZE, false, \ "Add HBCC size to the reported device memory") \ -release_on_stg(uint, GPU_WAVE_LIMIT_CU_PER_SH, 0, \ - "Assume the number of CU per SH for wave limiter") \ -release_on_stg(uint, GPU_WAVE_LIMIT_MAX_WAVE, 10, \ - "Set maximum waves per SIMD to try for wave limiter") \ -release_on_stg(uint, GPU_WAVE_LIMIT_RUN, 20, \ - "Set running factor for wave limiter") \ -release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \ - "File path prefix for dumping wave limiter output") \ -release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \ - "File path prefix for tracing wave limiter") \ release(bool, PAL_DISABLE_SDMA, false, \ "1 = Disable SDMA for PAL") \ release(uint, PAL_RGP_DISP_COUNT, 10000, \