SWDEV-407533 - [ABI Break]Remove Wavelimiter
Change-Id: I6a2f6fb5a0c3acea93fa0200a69679783e76f5bd
[ROCm/clr commit: 7be3a5e33e]
Этот коммит содержится в:
коммит произвёл
German Andryeyev
родитель
97d7b15428
Коммит
5d9912f48b
@@ -82,7 +82,6 @@ class EventMarker : public amd::Marker {
|
||||
int32_t scope = amd::Device::kCacheStateInvalid)
|
||||
: amd::Marker(stream, disableFlush) {
|
||||
profilingInfo_.enabled_ = true;
|
||||
profilingInfo_.callback_ = nullptr;
|
||||
profilingInfo_.marker_ts_ = markerTs;
|
||||
profilingInfo_.clear();
|
||||
setEventScope(scope);
|
||||
|
||||
@@ -70,7 +70,6 @@ target_sources(rocclr PRIVATE
|
||||
${ROCCLR_SRC_DIR}/device/device.cpp
|
||||
${ROCCLR_SRC_DIR}/device/devkernel.cpp
|
||||
${ROCCLR_SRC_DIR}/device/devprogram.cpp
|
||||
${ROCCLR_SRC_DIR}/device/devwavelimiter.cpp
|
||||
${ROCCLR_SRC_DIR}/device/hsailctx.cpp
|
||||
${ROCCLR_SRC_DIR}/elf/elf.cpp
|
||||
${ROCCLR_SRC_DIR}/os/alloc.cpp
|
||||
|
||||
@@ -589,8 +589,7 @@ Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& p
|
||||
: dev_(dev)
|
||||
, name_(name)
|
||||
, prog_(prog)
|
||||
, signature_(nullptr)
|
||||
, waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
|
||||
, signature_(nullptr) {
|
||||
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
|
||||
// Due to std::string not being able to be memset to 0
|
||||
workGroupInfo_.size_ = 0;
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#include "platform/context.hpp"
|
||||
#include "platform/object.hpp"
|
||||
#include "platform/memory.hpp"
|
||||
#include "devwavelimiter.hpp"
|
||||
|
||||
namespace amd {
|
||||
class Device;
|
||||
@@ -435,16 +434,6 @@ class Kernel : public amd::HeapObject {
|
||||
|
||||
size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
|
||||
|
||||
//! Get profiling callback object
|
||||
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
|
||||
return waveLimiter_.getProfilingCallback(vdev);
|
||||
};
|
||||
|
||||
//! Get waves per shader array to be used for kernel execution.
|
||||
uint getWavesPerSH(const device::VirtualDevice* vdev) const {
|
||||
return waveLimiter_.getWavesPerSH(vdev);
|
||||
};
|
||||
|
||||
//! Returns GPU device object, associated with this kernel
|
||||
const amd::Device& device() const { return dev_; }
|
||||
|
||||
@@ -567,7 +556,6 @@ class Kernel : public amd::HeapObject {
|
||||
amd::KernelSignature* signature_; //!< kernel signature
|
||||
std::string buildLog_; //!< build log
|
||||
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
|
||||
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
|
||||
std::string runtimeHandle_; //!< Runtime handle for context loader
|
||||
|
||||
uint64_t kernelCodeHandle_ = 0; //!< Kernel code handle (aka amd_kernel_code_t)
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#include "platform/context.hpp"
|
||||
#include "platform/object.hpp"
|
||||
#include "platform/memory.hpp"
|
||||
#include "devwavelimiter.hpp"
|
||||
|
||||
#if defined(USE_COMGR_LIBRARY)
|
||||
#include "amd_comgr/amd_comgr.h"
|
||||
|
||||
@@ -1,348 +0,0 @@
|
||||
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE. */
|
||||
|
||||
#include "platform/command.hpp"
|
||||
#include "device/devkernel.hpp"
|
||||
#include "device/devwavelimiter.hpp"
|
||||
#include "os/os.hpp"
|
||||
#include "utils/flags.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
using namespace std;
|
||||
|
||||
namespace device {
|
||||
|
||||
uint WaveLimiter::MaxWave;
|
||||
uint WaveLimiter::RunCount;
|
||||
uint WaveLimiter::AdaptCount;
|
||||
|
||||
// ================================================================================================
|
||||
//! Builds a limiter owned by \p manager.
//! \param manager    Owning manager; supplies the kernel name and the SIMD-per-SH count.
//! \param seqNum     Sequence number appended to the kernel name for the dump file.
//! \param enable     Requested adaptation state; forced to false when SIMDPerSH_ is 0.
//! \param enableDump Whether the data dumper records samples.
WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
    : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
  setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());

  // The class-wide tuning values are re-derived from the runtime flags on every construction.
  MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
  RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
  AdaptCount = MaxContinuousSamples * 2 * (MaxWave + 1);

  // Tracing is opt-in: a file is opened only when the trace prefix flag was set.
  if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
    traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
  }

  state_ = WARMUP;
  // Adaptation needs a non-zero SIMD count, since callback() divides by it.
  enable_ = (SIMDPerSH_ != 0) && enable;
  waves_ = MaxWave;
  bestWave_ = enable_ ? MaxWave : 0;
  worstWave_ = 0;
  sampleCount_ = 0;
  resultCount_ = 0;
  numContinuousSamples_ = 0;
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Closes the trace file if one was opened by the constructor.
WaveLimiter::~WaveLimiter() {
  if (!traceStream_.is_open()) {
    return;
  }
  traceStream_.close();
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Picks the wave count for the next dispatch and returns it scaled by SIMDPerSH_.
//! Outside of an active adaptation pass the best known wave count is used.
uint WaveLimiter::getWavesPerSH() {
  const bool adapting = (state_ == ADAPT) && (sampleCount_ < AdaptCount);
  if (!adapting) {
    waves_ = bestWave_;
    return waves_ * SIMDPerSH_;
  }

  // A new candidate is chosen only at the start of each run of MaxContinuousSamples samples.
  if (numContinuousSamples_ == 0) {
    waves_ = (waves_ + 1) % (MaxWave + 1);
    // Wave count 0 is always allowed; non-zero counts up to worstWave_ are skipped
    // so the configuration with the worst performance is not executed again.
    if (waves_ != 0) {
      while (worstWave_ >= waves_) {
        waves_ = (waves_ + 1) % (MaxWave + 1);
      }
    }
  }
  numContinuousSamples_ = (numContinuousSamples_ + 1) % MaxContinuousSamples;
  ++sampleCount_;

  return waves_ * SIMDPerSH_;
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Constructs the smoothing algorithm on top of the base limiter and sizes the
//! per-wave-count tables with one slot for every wave count in [0, MaxWave].
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
                                     bool enableDump)
    : WaveLimiter(manager, seqNum, enable, enableDump) {
  const auto slotCount = MaxWave + 1;
  adpMeasure_.resize(slotCount);
  adpSampleCnt_.resize(slotCount);
  runMeasure_.resize(slotCount);
  runSampleCnt_.resize(slotCount);

  dynRunCount_ = RunCount;

  clearData();
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Nothing to release beyond what the members and the base class clean up themselves.
WLAlgorithmSmooth::~WLAlgorithmSmooth() = default;
|
||||
|
||||
// ================================================================================================
|
||||
//! Resets the adaptation bookkeeping before a new adaptation pass.
//! clear() only zeroes table entries whose index is above worstWave_.
void WLAlgorithmSmooth::clearData() {
  clear(adpMeasure_);
  clear(adpSampleCnt_);
  dataCount_ = 0;
  countAll_ = 0;
  waves_ = MaxWave;
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Intentionally empty: the body performs no work and ignores \p time.
void WLAlgorithmSmooth::updateData(ulong time) {}
|
||||
|
||||
// ================================================================================================
|
||||
//! Writes the current limiter state and the measurement tables to the trace file.
//! Does nothing when no trace file is open.
//!
//! This routine only reads limiter state. Zero sample counts are treated as 1 for
//! the averages, but through local values rather than by writing 1 back into
//! runSampleCnt_, so enabling tracing does not change later adaptation decisions.
void WLAlgorithmSmooth::outputTrace() {
  if (!traceStream_.is_open()) {
    return;
  }

  traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ <<
      " waves=" << waves_ << " bestWave=" << bestWave_ << " worstWave=" << worstWave_ << '\n';
  output(traceStream_, "\n adaptive measure = ", adpMeasure_);
  output(traceStream_, "\n adaptive sample count = ", adpSampleCnt_);
  output(traceStream_, "\n run measure = ", runMeasure_);
  output(traceStream_, "\n run sample count = ", runSampleCnt_);
  traceStream_ << "\n % time from the previous runs to the best wave: ";

  // Average adaptation time of the best wave count; a zero sample count is treated as 1.
  const uint32_t bestSamples = (adpSampleCnt_[bestWave_] == 0) ? 1 : adpSampleCnt_[bestWave_];
  const float min = static_cast<float>(adpMeasure_[bestWave_]) / bestSamples;

  for (uint i = 0; i < (MaxWave + 1); ++i) {
    // Local divisor only: the stored sample count must stay untouched.
    const uint32_t samples = (runSampleCnt_[i] == 0) ? 1 : runSampleCnt_[i];
    const float average = static_cast<float>(runMeasure_[i]) / samples;
    traceStream_ << (average * 100 / min) << " ";
  }
  traceStream_ << "\n run count = " << dynRunCount_;
  traceStream_ << "\n\n";
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Profiling callback: consumes one kernel execution sample.
//! \param duration Measured execution time of the dispatch; a value of 0 is ignored.
//! \param waves    Waves per SH used by the dispatch; divided by SIMDPerSH_ to get
//!                 the per-SIMD wave count that indexes the measurement tables.
//!
//! The sample is always forwarded to the data dumper. When adaptation is enabled it
//! also drives the WARMUP -> ADAPT -> RUN state machine.
void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
  dumper_.addData(duration, waves, static_cast<char>(state_));

  if (!enable_ || (duration == 0)) {
    return;
  }

  // enable_ is only true when SIMDPerSH_ is non-zero, so the division is safe.
  waves /= SIMDPerSH_;
  // The tables hold MaxWave + 1 entries. A dispatch issued with a wave count above
  // MaxWave cannot be recorded, so drop the sample instead of writing out of bounds.
  if (waves > MaxWave) {
    return;
  }

  countAll_++;

  // Collect the time for the current wave count
  runMeasure_[waves] += duration;
  runSampleCnt_[waves]++;

  switch (state_) {
    case ADAPT:
      assert(duration > 0);
      // Wave count 0 indicates the start of adaptation
      if ((waves == 0) || (resultCount_ > 0)) {
        // Accumulate the raw duration for this wave count
        adpMeasure_[waves] += duration;
        adpSampleCnt_[waves]++;
        resultCount_++;
        // If the end of adaptation is reached, then analyze the results
        if (resultCount_ == AdaptCount) {
          // Reset the counters
          resultCount_ = sampleCount_ = 0;
          float min = std::numeric_limits<float>::max();
          // numeric_limits<float>::min() is the smallest positive normalized value,
          // so an average of exactly 0 never becomes the worst candidate.
          float max = std::numeric_limits<float>::min();
          uint32_t best = bestWave_;
          // Check performance for the previous run if it's available
          if (runSampleCnt_[bestWave_] > 0) {
            min = static_cast<float>(runMeasure_[bestWave_]) / runSampleCnt_[bestWave_];
          } else if (adpSampleCnt_[MaxWave] > 0) {
            min = static_cast<float>(adpMeasure_[MaxWave]) / adpSampleCnt_[MaxWave];
            bestWave_ = MaxWave;
          }
          // Find the fastest average time
          float reference = min;
          for (uint i = MaxWave; i > 0; --i) {
            float average;
            if (adpSampleCnt_[i] > 0) {
              average = static_cast<float>(adpMeasure_[i]) / adpSampleCnt_[i];
            } else {
              average = 0.0f;
            }
            // More waves have 5% advantage over the lower number
            if (average * 1.05f < min) {
              min = average;
              bestWave_ = i;
            }
            if (average > max) {
              max = average;
              worstWave_ = i;
            }
          }
          // Check for 5% acceptance
          if ((min * 1.05f > reference) || (bestWave_ == best)) {
            bestWave_ = best;
            // Increase the run time if the same wave count is the best
            dynRunCount_ += RunCount;
            dynRunCount_++;
          } else {
            dynRunCount_ = RunCount;
          }
          // Find the middle between the best and the worst
          if (worstWave_ < bestWave_) {
            worstWave_ += ((bestWave_ - worstWave_) >> 1);
          } else {
            worstWave_ = 0;
          }
          state_ = RUN;
          outputTrace();
          // Start to collect the new data for the best wave
          countAll_ = 0;
          runMeasure_[bestWave_] = 0;
          runSampleCnt_[bestWave_] = 0;
        }
      }
      return;
    case WARMUP:
    case RUN:
      // Stay in the current state until enough executions have been observed
      if (countAll_ < dynRunCount_) {
        return;
      }
      if (state_ == WARMUP) {
        runSampleCnt_[bestWave_] = 0;
      }
      state_ = ADAPT;
      clearData();
      return;
  }
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Sets up the dumper. The output file name is only composed when dumping is enabled:
//! the GPU_WAVE_LIMIT_DUMP prefix followed by \p kernelName and ".csv".
WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable)
    : enable_(enable) {
  if (!enable_) {
    return;
  }
  fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Flushes every recorded sample to the CSV file, one row per sample:
//! index, time, waves, state. Nothing is written when dumping is disabled.
WaveLimiter::DataDumper::~DataDumper() {
  if (enable_) {
    std::ofstream OFS(fileName_);
    const size_t rowCount = time_.size();
    for (size_t row = 0; row != rowCount; ++row) {
      OFS << row << ',' << time_[row] << ',' << wavePerSIMD_[row] << ','
          << static_cast<uint>(state_[row]) << '\n';
    }
    OFS.close();
  }
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Appends one sample (time, wave count, limiter state) to the in-memory log.
//! The sample is dropped when dumping is disabled.
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
  if (enable_) {
    time_.push_back(time);
    wavePerSIMD_.push_back(wave);
    state_.push_back(state);
  }
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Creates the manager for \p kernel.
//! \param kernel    Kernel that owns this manager.
//! \param simdPerSH SIMD count per SH; 0 is replaced by 1, and the
//!                  GPU_WAVE_LIMIT_CU_PER_SH flag takes precedence when it is set.
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
    : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
  const uint fallbackSimdPerSH = (simdPerSH == 0) ? 1 : simdPerSH;
  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, fallbackSimdPerSH);
  // A non-zero GPU_WAVES_PER_SIMD yields a fixed waves-per-SH value.
  fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Destroys every limiter created by getProfilingCallback(); the map owns them.
WaveLimiterManager::~WaveLimiterManager() {
  for (auto it = limiters_.begin(); it != limiters_.end(); ++it) {
    delete it->second;
  }
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Returns the name of the owning kernel.
const std::string& WaveLimiterManager::name() const {
  return owner_->name();
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Returns the waves-per-SH value to use for \p vdev.
//! A fixed value wins when one is set. Otherwise the per-device limiter is consulted,
//! and 0 is returned when adaptation is disabled or no limiter exists for the device.
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
  if (fixed_ > 0) {
    return fixed_;
  }

  if (enable_) {
    const auto entry = limiters_.find(vdev);
    if (entry != limiters_.end()) {
      assert(entry->second != nullptr);
      return entry->second->getWavesPerSH();
    }
  }
  return 0;
}
|
||||
|
||||
//! Returns the profiling callback (wave limiter) bound to \p vdev, creating it on
//! first use. Returns nullptr when neither adaptation nor dumping is enabled.
//! The lookup and the insertion into the map happen under monitor_.
amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
    const device::VirtualDevice* vdev) {
  assert(vdev != nullptr);
  const bool limiterNeeded = enable_ || enableDump_;
  if (!limiterNeeded) {
    return nullptr;
  }

  amd::ScopedLock SL(monitor_);
  const auto existing = limiters_.find(vdev);
  if (existing != limiters_.end()) {
    return existing->second;
  }

  // The current map size serves as the sequence number of the new limiter.
  auto* created = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
  if (created == nullptr) {
    enable_ = false;
    return nullptr;
  }
  limiters_[vdev] = created;
  return created;
}
|
||||
|
||||
// ================================================================================================
|
||||
void WaveLimiterManager::enable(bool isSupported) {
|
||||
if (fixed_ > 0) {
|
||||
enable_ = GPU_WAVE_LIMIT_ENABLE;
|
||||
return;
|
||||
}
|
||||
|
||||
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
|
||||
// Disabled for SI due to bug #10817
|
||||
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
|
||||
enable_ = GPU_WAVE_LIMIT_ENABLE;
|
||||
} else if (isSupported) {
|
||||
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
|
||||
enable_ = true;
|
||||
} else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
|
||||
fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace device
|
||||
@@ -1,173 +0,0 @@
|
||||
/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "thread/thread.hpp"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace amd {
|
||||
//! Callback interface: implementers are handed a duration and a wave count via callback().
struct ProfilingCallback : public amd::HeapObject {
|
||||
//! Invoked with a measured duration and the wave count associated with it.
virtual void callback(ulong duration, uint32_t waves) = 0;
|
||||
};
|
||||
}
|
||||
|
||||
//! \namespace device Device Implementation
|
||||
namespace device {
|
||||
|
||||
class WaveLimiterManager;
|
||||
class Kernel;
|
||||
|
||||
// Adaptively limit the number of waves per SIMD based on kernel execution time
|
||||
class WaveLimiter : public amd::ProfilingCallback {
|
||||
public:
|
||||
explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WaveLimiter();
|
||||
|
||||
//! Get waves per shader array to be used for kernel execution.
|
||||
uint getWavesPerSH();
|
||||
|
||||
protected:
|
||||
enum StateKind { WARMUP, ADAPT, RUN };
|
||||
|
||||
class DataDumper {
|
||||
public:
|
||||
explicit DataDumper(const std::string& kernelName, bool enable);
|
||||
~DataDumper();
|
||||
|
||||
//! Record execution time, waves/simd and state of wave limiter.
|
||||
void addData(ulong time, uint wave, char state);
|
||||
|
||||
//! Whether this data dumper is enabled.
|
||||
bool enabled() const { return enable_; }
|
||||
|
||||
private:
|
||||
bool enable_;
|
||||
std::string fileName_;
|
||||
std::vector<ulong> time_;
|
||||
std::vector<uint> wavePerSIMD_;
|
||||
std::vector<char> state_;
|
||||
};
|
||||
|
||||
bool enable_;
|
||||
uint SIMDPerSH_; // Number of SIMDs per SH
|
||||
uint waves_; // Waves per SIMD to be set
|
||||
uint bestWave_; // Optimal waves per SIMD
|
||||
uint worstWave_; // Wave number with the worst performance
|
||||
uint countAll_; // Number of kernel executions
|
||||
StateKind state_;
|
||||
WaveLimiterManager* manager_;
|
||||
DataDumper dumper_;
|
||||
std::ofstream traceStream_;
|
||||
uint32_t sampleCount_; //!< The number of samples for adaptive mode
|
||||
uint32_t resultCount_; //!< The number of results for adaptive mode
|
||||
uint32_t numContinuousSamples_; //!< The number of samples with the same wave count
|
||||
|
||||
static uint MaxWave; // Maximum number of waves per SIMD
|
||||
static uint RunCount; // Number of kernel executions for normal run
|
||||
static uint AdaptCount; // Number of kernel executions for adapting
|
||||
static constexpr uint MaxContinuousSamples = 2;
|
||||
|
||||
//! Call back from Event::recordProfilingInfo to get execution time.
|
||||
virtual void callback(ulong duration, uint32_t waves) = 0;
|
||||
|
||||
//! Output trace of measurement/adaptation.
|
||||
virtual void outputTrace() = 0;
|
||||
|
||||
template <class T> void clear(T& A) {
|
||||
uint idx = 0;
|
||||
for (auto& I : A) {
|
||||
if (idx > worstWave_) {
|
||||
I = 0;
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
template <class T> void output(std::ofstream& ofs, const std::string& prompt, T& A) {
|
||||
ofs << prompt;
|
||||
for (auto& I : A) {
|
||||
ofs << ' ' << static_cast<ulong>(I);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class WLAlgorithmSmooth : public WaveLimiter {
|
||||
public:
|
||||
explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
|
||||
bool enableDump);
|
||||
virtual ~WLAlgorithmSmooth();
|
||||
|
||||
private:
|
||||
std::vector<uint64_t> adpMeasure_; //!< Accumulated performance in the adaptation mode
|
||||
std::vector<uint32_t> adpSampleCnt_; //!< The number of samples in the adaptation mode
|
||||
std::vector<uint64_t> runMeasure_; //!< Accumulated performance in the run mode
|
||||
std::vector<uint32_t> runSampleCnt_; //!< The number of samples in the run mode
|
||||
uint dynRunCount_;
|
||||
uint dataCount_;
|
||||
|
||||
//! Update measurement data and optimal waves/simd with execution time.
|
||||
void updateData(ulong time);
|
||||
|
||||
//! Clear measurement data for the next adaptation.
|
||||
void clearData();
|
||||
|
||||
//! Call back from Event::recordProfilingInfo to get execution time.
|
||||
void callback(ulong duration, uint32_t waves) override;
|
||||
|
||||
//! Output trace of measurement/adaptation.
|
||||
void outputTrace() override;
|
||||
};
|
||||
|
||||
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
|
||||
class WaveLimiterManager {
|
||||
public:
|
||||
explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH);
|
||||
virtual ~WaveLimiterManager();
|
||||
|
||||
//! Get waves per shader array for a specific virtual device.
|
||||
uint getWavesPerSH(const VirtualDevice*) const;
|
||||
|
||||
//! Provide call back function for a specific virtual device.
|
||||
amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*);
|
||||
|
||||
//! Enable wave limiter manager by kernel metadata and flags.
|
||||
void enable(bool isSupported = true);
|
||||
|
||||
//! Returns the kernel name
|
||||
const std::string& name() const;
|
||||
|
||||
//! Get SimdPerSH.
|
||||
uint getSimdPerSH() const { return simdPerSH_; }
|
||||
|
||||
private:
|
||||
device::Kernel* owner_; // The kernel which owns this object
|
||||
uint simdPerSH_; // Simd Per SH
|
||||
std::unordered_map<const VirtualDevice*, WaveLimiter*>
|
||||
limiters_; // Maps virtual device to wave limiter
|
||||
bool enable_; // Whether the adaptation is enabled
|
||||
bool enableDump_; // Whether the data dumper is enabled
|
||||
uint fixed_; // The fixed waves/simd value if not zero
|
||||
amd::Monitor monitor_; // The mutex for updating the wave limiter map
|
||||
};
|
||||
}
|
||||
@@ -224,8 +224,6 @@ bool HSAILKernel::init() {
|
||||
return false;
|
||||
}
|
||||
|
||||
waveLimiter_.enable();
|
||||
|
||||
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
|
||||
error = amd::Hsail::QueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
|
||||
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
#include "amd_hsa_kernel_code.h"
|
||||
#include "device/pal/palprintf.hpp"
|
||||
#include "device/devwavelimiter.hpp"
|
||||
#include "hsa.h"
|
||||
|
||||
namespace amd {
|
||||
|
||||
@@ -2473,8 +2473,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
|
||||
&vcmd.event(), vcmd.sharedMemBytes(),
|
||||
vcmd.cooperativeGroups())) {
|
||||
vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
|
||||
@@ -2489,7 +2488,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
profilingBegin(vcmd);
|
||||
|
||||
// Submit kernel to HW
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
|
||||
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
|
||||
vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
|
||||
vcmd.setStatus(CL_INVALID_OPERATION);
|
||||
}
|
||||
@@ -2499,9 +2498,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
|
||||
const_address parameters, bool nativeMem,
|
||||
amd::Event* enqueueEvent, uint32_t sharedMemBytes,
|
||||
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
|
||||
const amd::Kernel& kernel, const_address parameters,
|
||||
bool nativeMem, uint32_t sharedMemBytes,
|
||||
bool cooperativeGroup) {
|
||||
size_t newOffset[3] = {0, 0, 0};
|
||||
size_t newGlobalSize[3] = {0, 0, 0};
|
||||
@@ -2648,7 +2647,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
}
|
||||
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
|
||||
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
|
||||
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
|
||||
dispatchParam.wavesPerSh = 0;
|
||||
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
|
||||
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
|
||||
#ifdef PAL_DEBUGGER
|
||||
|
||||
@@ -311,7 +311,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
const amd::Kernel& kernel, //!< Kernel for execution
|
||||
const_address parameters, //!< Parameters for the kernel
|
||||
bool nativeMem = true, //!< Native memory objects
|
||||
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
|
||||
uint32_t sharedMemBytes = 0, //!< Shared memory size
|
||||
bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
|
||||
);
|
||||
|
||||
@@ -101,10 +101,6 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
|
||||
break;
|
||||
default:
|
||||
profilingInfo_.end_ = timeStamp;
|
||||
if (profilingInfo_.callback_ != nullptr) {
|
||||
profilingInfo_.callback_->callback(timeStamp - profilingInfo_.start_,
|
||||
profilingInfo_.waves_);
|
||||
}
|
||||
break;
|
||||
}
|
||||
return timeStamp;
|
||||
@@ -429,15 +425,12 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
|
||||
firstDevice_(firstDevice) {
|
||||
auto& device = queue.device();
|
||||
auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
|
||||
profilingInfo_.setCallback(devKernel->getProfilingCallback(
|
||||
queue.vdev()), devKernel->getWavesPerSH(queue.vdev()));
|
||||
if (cooperativeGroups()) {
|
||||
setNumWorkgroups();
|
||||
}
|
||||
if (forceProfiling) {
|
||||
profilingInfo_.enabled_ = true;
|
||||
profilingInfo_.clear();
|
||||
profilingInfo_.callback_ = nullptr;
|
||||
profilingInfo_.marker_ts_ = true;
|
||||
}
|
||||
kernel_.retain();
|
||||
|
||||
@@ -104,10 +104,9 @@ class Event : public RuntimeObject {
|
||||
static const EventWaitList nullWaitList;
|
||||
|
||||
struct ProfilingInfo {
|
||||
ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
|
||||
ProfilingInfo(bool enabled = false) : enabled_(enabled), marker_ts_(false) {
|
||||
if (enabled) {
|
||||
clear();
|
||||
callback_ = nullptr;
|
||||
correlation_id_ = activity_prof::correlation_id;
|
||||
}
|
||||
}
|
||||
@@ -116,11 +115,9 @@ class Event : public RuntimeObject {
|
||||
uint64_t submitted_;
|
||||
uint64_t start_;
|
||||
uint64_t end_;
|
||||
bool enabled_; //!< Profiling enabled for the wave limiter
|
||||
uint32_t waves_; //!< The number of waves used in a dispatch
|
||||
ProfilingCallback* callback_;
|
||||
uint64_t correlation_id_;
|
||||
bool marker_ts_; //!< TS marker
|
||||
bool enabled_; //!< Profiling enabled for the wave limiter
|
||||
bool marker_ts_; //!< TS marker
|
||||
|
||||
void clear() {
|
||||
queued_ = 0ULL;
|
||||
@@ -128,15 +125,6 @@ class Event : public RuntimeObject {
|
||||
start_ = 0ULL;
|
||||
end_ = 0ULL;
|
||||
}
|
||||
void setCallback(ProfilingCallback* callback, uint32_t waves) {
|
||||
if (callback == NULL) {
|
||||
return;
|
||||
}
|
||||
enabled_ = true;
|
||||
waves_ = waves;
|
||||
clear();
|
||||
callback_ = callback;
|
||||
}
|
||||
} profilingInfo_;
|
||||
|
||||
//! Construct a new event.
|
||||
@@ -163,7 +151,6 @@ class Event : public RuntimeObject {
|
||||
void EnableProfiling() {
|
||||
profilingInfo_.enabled_ = true;
|
||||
profilingInfo_.clear();
|
||||
profilingInfo_.callback_ = nullptr;
|
||||
profilingInfo_.correlation_id_ = activity_prof::correlation_id;
|
||||
}
|
||||
|
||||
|
||||
@@ -118,8 +118,6 @@ release(uint, OCL_SET_SVM_SIZE, 4*16384, \
|
||||
"set SVM space size for discrete GPU") \
|
||||
release(uint, GPU_WAVES_PER_SIMD, 0, \
|
||||
"Force the number of waves per SIMD (1-10)") \
|
||||
release(bool, GPU_WAVE_LIMIT_ENABLE, false, \
|
||||
"1 = Enable adaptive wave limiter") \
|
||||
release(bool, OCL_STUB_PROGRAMS, false, \
|
||||
"1 = Enables OCL programs stubing") \
|
||||
release(bool, GPU_ANALYZE_HANG, false, \
|
||||
@@ -128,16 +126,6 @@ release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2, \
|
||||
"Maximum size (in Ki) that allows device memory substitution with system") \
|
||||
release(bool, GPU_ADD_HBCC_SIZE, false, \
|
||||
"Add HBCC size to the reported device memory") \
|
||||
release_on_stg(uint, GPU_WAVE_LIMIT_CU_PER_SH, 0, \
|
||||
"Assume the number of CU per SH for wave limiter") \
|
||||
release_on_stg(uint, GPU_WAVE_LIMIT_MAX_WAVE, 10, \
|
||||
"Set maximum waves per SIMD to try for wave limiter") \
|
||||
release_on_stg(uint, GPU_WAVE_LIMIT_RUN, 20, \
|
||||
"Set running factor for wave limiter") \
|
||||
release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \
|
||||
"File path prefix for dumping wave limiter output") \
|
||||
release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \
|
||||
"File path prefix for tracing wave limiter") \
|
||||
release(bool, PAL_DISABLE_SDMA, false, \
|
||||
"1 = Disable SDMA for PAL") \
|
||||
release(uint, PAL_RGP_DISP_COUNT, 10000, \
|
||||
|
||||
Ссылка в новой задаче
Block a user