Files
rocm-systems/rocclr/runtime/device/devwavelimiter.cpp
T
foreman 6cc75de90f P4 to Git Change 1599699 by gandryey@gera-w8 on 2018/08/29 18:43:02
SWDEV-79445 - OCL generic changes and code clean-up
	- Move WaveLimiter logic to the abstract layer. PAL version was taken as the base, thus performance of GSL path can be affected by this change

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#315 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.cpp#1 move/add
... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.hpp#1 move/add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#598 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#331 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#133 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#15 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#11 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#8 move/delete
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#8 move/delete
2018-08-29 18:54:19 -04:00

331 γραμμές
11 KiB
C++

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/command.hpp"
#include "device/devkernel.hpp"
#include "device/devwavelimiter.hpp"
#include "os/os.hpp"
#include "utils/flags.hpp"
#include <cstdlib>
using namespace std;
namespace device {
uint WaveLimiter::MaxWave;
uint WaveLimiter::RunCount;
uint WaveLimiter::AdaptCount;
// ================================================================================================
WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
: manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
AdaptCount = MaxContinuousSamples * 2 * (MaxWave + 1);
state_ = WARMUP;
if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
}
waves_ = MaxWave;
enable_ = (SIMDPerSH_ == 0) ? false : enable;
bestWave_ = (enable_) ? MaxWave : 0;
worstWave_ = 0;
sampleCount_ = 0;
resultCount_ = 0;
numContinuousSamples_ = 0;
}
// ================================================================================================
WaveLimiter::~WaveLimiter() {
if (traceStream_.is_open()) {
traceStream_.close();
}
}
// ================================================================================================
uint WaveLimiter::getWavesPerSH() {
// Generate different wave counts in the adaptation mode
if ((state_ == ADAPT) && (sampleCount_ < AdaptCount)) {
if (numContinuousSamples_ == 0) {
++waves_;
waves_ %= MaxWave + 1;
// Don't execute the wave count with the worst performance
if (waves_ != 0) {
while (worstWave_ >= waves_) {
++waves_;
waves_ %= MaxWave + 1;
}
}
}
++numContinuousSamples_;
numContinuousSamples_ %= MaxContinuousSamples;
++sampleCount_;
}
else {
waves_ = bestWave_;
}
return waves_ * SIMDPerSH_;
}
// ================================================================================================
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
bool enableDump)
: WaveLimiter(manager, seqNum, enable, enableDump) {
dynRunCount_ = RunCount;
adpMeasure_.resize(MaxWave + 1);
adpSampleCnt_.resize(MaxWave + 1);
runMeasure_.resize(MaxWave + 1);
runSampleCnt_.resize(MaxWave + 1);
clearData();
}
// ================================================================================================
WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
// ================================================================================================
void WLAlgorithmSmooth::clearData() {
waves_ = MaxWave;
countAll_ = 0;
clear(adpMeasure_);
clear(adpSampleCnt_);
dataCount_ = 0;
}
// ================================================================================================
void WLAlgorithmSmooth::updateData(ulong time) {
}
// ================================================================================================
void WLAlgorithmSmooth::outputTrace() {
if (!traceStream_.is_open()) {
return;
}
traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ <<
" waves=" << waves_ << " bestWave=" << bestWave_ << " worstWave=" << worstWave_ << '\n';
output(traceStream_, "\n adaptive measure = ", adpMeasure_);
output(traceStream_, "\n adaptive smaple count = ", adpSampleCnt_);
output(traceStream_, "\n run measure = ", runMeasure_);
output(traceStream_, "\n run smaple count = ", runSampleCnt_);
traceStream_ << "\n % time from the previous runs to the best wave: ";
float min = static_cast<float>(adpMeasure_[bestWave_]) / adpSampleCnt_[bestWave_];
for (uint i = 0; i < (MaxWave + 1); ++i) {
runSampleCnt_[i] = (runSampleCnt_[i] == 0) ? 1 : runSampleCnt_[i];
float average = static_cast<float>(runMeasure_[i]) / runSampleCnt_[i];
traceStream_ << (average * 100 / min) << " ";
}
traceStream_ << "\n run count = " << dynRunCount_;
traceStream_ << "\n\n";
}
// ================================================================================================
void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
dumper_.addData(duration, waves, static_cast<char>(state_));
if (!enable_ || (duration == 0)) {
return;
}
countAll_++;
waves /= SIMDPerSH_;
// Collect the time for the current wave count
runMeasure_[waves] += duration;
runSampleCnt_[waves]++;
switch (state_) {
case ADAPT:
assert(duration > 0);
// Wave count 0 indicates the satrt of adaptation
if ((waves == 0) || (resultCount_ > 0)) {
// Scale time to us
adpMeasure_[waves] += duration;
adpSampleCnt_[waves]++;
resultCount_++;
// If the end of adaptation is reached, then analyze the results
if (resultCount_ == AdaptCount) {
// Reset the counters
resultCount_ = sampleCount_ = 0;
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::min();
uint32_t best = bestWave_;
// Check performance for the previous run if it's available
if (runSampleCnt_[bestWave_] > 0) {
min = static_cast<float>(runMeasure_[bestWave_]) / runSampleCnt_[bestWave_];
}
else if (adpSampleCnt_[MaxWave] > 0) {
min = static_cast<float>(adpMeasure_[MaxWave]) / adpSampleCnt_[MaxWave];
bestWave_ = MaxWave;
}
// Find the fastest average time
float reference = min;
for (uint i = MaxWave; i > 0; --i) {
float average;
if (adpSampleCnt_[i] > 0) {
average = static_cast<float>(adpMeasure_[i]) / adpSampleCnt_[i];
}
else {
average = 0.0f;
}
// More waves have 5% advantage over the lower number
if (average * 1.05f < min) {
min = average;
bestWave_ = i;
}
if (average > max) {
max = average;
worstWave_ = i;
}
}
// Check for 5% acceptance
if ((min * 1.05f > reference) || (bestWave_ == best)) {
bestWave_ = best;
// Increase the run time if the same wave count is the best
dynRunCount_ += RunCount;
dynRunCount_++;
}
else {
dynRunCount_ = RunCount;
}
// Find the middle between the best and the worst
if (worstWave_ < bestWave_) {
worstWave_ += ((bestWave_ - worstWave_) >> 1);
} else {
worstWave_ = 0;
}
state_ = RUN;
outputTrace();
// Start to collect the new data for the best wave
countAll_ = 0;
runMeasure_[bestWave_] = 0;
runSampleCnt_[bestWave_] = 0;
}
}
return;
case WARMUP:
case RUN:
if (countAll_ < dynRunCount_) {
return;
}
if (state_ == WARMUP) {
runSampleCnt_[bestWave_] = 0;
}
state_ = ADAPT;
clearData();
return;
}
}
// ================================================================================================
WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
enable_ = enable;
if (enable_) {
fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
}
}
// ================================================================================================
WaveLimiter::DataDumper::~DataDumper() {
if (!enable_) {
return;
}
std::ofstream OFS(fileName_);
for (size_t i = 0, e = time_.size(); i != e; ++i) {
OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast<uint>(state_[i])
<< '\n';
}
OFS.close();
}
// ================================================================================================
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
if (!enable_) {
return;
}
time_.push_back(time);
wavePerSIMD_.push_back(wave);
state_.push_back(state);
}
// ================================================================================================
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
: owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, ((simdPerSH == 0) ? 1 : simdPerSH));
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
// ================================================================================================
WaveLimiterManager::~WaveLimiterManager() {
for (auto& I : limiters_) {
delete I.second;
}
}
// ================================================================================================
const std::string& WaveLimiterManager::name() const { return owner_->name(); }
// ================================================================================================
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
if (fixed_ > 0) {
return fixed_;
}
if (!enable_) {
return 0;
}
auto loc = limiters_.find(vdev);
if (loc == limiters_.end()) {
return 0;
}
assert(loc->second != nullptr);
return loc->second->getWavesPerSH();
}
amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
const device::VirtualDevice* vdev) {
assert(vdev != nullptr);
if (!enable_ && !enableDump_) {
return nullptr;
}
amd::ScopedLock SL(monitor_);
auto loc = limiters_.find(vdev);
if (loc != limiters_.end()) {
return loc->second;
}
auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
if (limiter == nullptr) {
enable_ = false;
return nullptr;
}
limiters_[vdev] = limiter;
return limiter;
}
// ================================================================================================
void WaveLimiterManager::enable(bool isSupported) {
if (fixed_ > 0) {
return;
}
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
// Disabled for SI due to bug #10817
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
enable_ = GPU_WAVE_LIMIT_ENABLE;
} else if (isSupported) {
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
enable_ = true;
} else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
}
}
}
} // namespace pal