1e90e581d6
SWDEV-142271 - Performance drop is observed in Ocean Surface Simulation of Compubenchcl in 17.50 when compared to 17.Q4.1 - Rewrite the adaptive mode for wave limiter. Make sure the performance feedback corresponds to the right wave count. Add the new sampling logic to find the best number, based on average performance. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#295 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#71 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#80 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#88 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#282 edit
277 рядки
7.0 KiB
C++
277 рядки
7.0 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "device/gpu/gpuwavelimiter.hpp"
|
|
#include "os/os.hpp"
|
|
#include "utils/flags.hpp"
|
|
|
|
#include <cstdlib>
|
|
using namespace std;
|
|
|
|
namespace gpu {
|
|
|
|
// Class-wide tuning parameters shared by every limiter instance. They start at
// zero and are assigned in the WaveLimiter constructor each time a limiter is
// created.
uint WaveLimiter::MaxWave = 0;
uint WaveLimiter::WarmUpCount = 0;
uint WaveLimiter::RunCount = 0;

// Parameters specific to the "smooth" algorithm; assigned in the
// WLAlgorithmSmooth constructor.
uint WLAlgorithmSmooth::AdaptCount = 0;
uint WLAlgorithmSmooth::AbandonThresh = 0;
uint WLAlgorithmSmooth::DscThresh = 0;
|
|
|
|
// Builds the state shared by all wave-limiter algorithms.
//   manager    - owning WaveLimiterManager; supplies the kernel name and the
//                SIMD-per-SH count
//   seqNum     - sequence number appended to the kernel name handed to the
//                data dumper
//   enable     - whether adaptive limiting is active for this instance
//   enableDump - whether per-callback samples are recorded by the data dumper
WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
    // The dumper name is built from the constructor argument rather than from
    // manager_, so this initializer stays correct regardless of the order in
    // which manager_ and dumper_ are declared in the class.
    : manager_(manager), dumper_(manager->name() + "_" + std::to_string(seqNum), enableDump) {
  // NOTE(review): setIfNotDefault presumably prefers the GPU_WAVE_LIMIT_CU_PER_SH
  // flag when it was set explicitly and falls back to the manager's value
  // otherwise - confirm against its definition.
  setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());

  // The tuning parameters are static and are re-read from the runtime flags on
  // every construction.
  MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
  WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
  RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;

  state_ = WARMUP;

  // A trace file is opened only when the trace flag was set explicitly; the
  // flag value is used as the path prefix.
  if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
    traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
  }

  // Start at the maximum wave count.
  waves_ = MaxWave;
  currWaves_ = MaxWave;
  bestWave_ = MaxWave;
  enable_ = enable;
}
|
|
|
|
// Closes the trace stream if the constructor opened one.
WaveLimiter::~WaveLimiter() {
  if (!traceStream_.is_open()) {
    return;
  }
  traceStream_.close();
}
|
|
|
|
// Returns the current wave count multiplied by SIMDPerSH_ and records in
// currWaves_ the wave count that was handed out.
uint WaveLimiter::getWavesPerSH() {
  const auto wavesPerSH = waves_ * SIMDPerSH_;
  currWaves_ = waves_;
  return wavesPerSH;
}
|
|
|
|
// Sets up the "smooth" adaptation algorithm on top of the common limiter
// state. All arguments are forwarded unchanged to the WaveLimiter constructor.
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
                                     bool enableDump)
    : WaveLimiter(manager, seqNum, enable, enableDump) {
  // Thresholds come from runtime flags; the adaptation length is derived from
  // the maximum wave count set by the base constructor.
  AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
  DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
  AdaptCount = 2 * MaxWave + 1;

  dynRunCount_ = RunCount;

  // Every table gets MaxWave + 1 entries.
  const auto slots = MaxWave + 1;
  measure_.resize(slots);
  reference_.resize(slots);
  trial_.resize(slots);
  ratio_.resize(slots);

  clearData();
}
|
|
|
|
// Nothing to release beyond what the members and the base class clean up.
WLAlgorithmSmooth::~WLAlgorithmSmooth() = default;
|
|
|
|
// Resets the adaptation bookkeeping: counters go back to zero, the wave count
// returns to the maximum, and all sample tables are cleared.
void WLAlgorithmSmooth::clearData() {
  // Scalar state.
  dataCount_ = 0;
  countAll_ = 0;
  discontinuous_ = false;
  waves_ = MaxWave;

  // Per-wave sample tables.
  clear(measure_);
  clear(reference_);
  clear(trial_);
  clear(ratio_);
}
|
|
|
|
// Records one timing sample taken during the ADAPT phase.
//
// Samples alternate between two kinds, selected by the parity of
// (dataCount_ - 1):
//   - even: a baseline run at MaxWave, stored in measure_. From the second
//     baseline on, it is averaged with the previous baseline to form the
//     reference for the trial taken in between, and the trial/reference ratio
//     (in percent) is computed.
//   - odd: a trial run at a reduced wave count, stored in trial_.
//
//   time - measured duration of the sample; must be non-zero.
void WLAlgorithmSmooth::updateData(ulong time) {
  auto count = dataCount_ - 1;
  assert(count < 2 * MaxWave + 1);
  assert(time > 0);
  assert(currWaves_ == waves_);
  if (count % 2 == 0) {
    assert(waves_ == MaxWave);
    auto pos = count / 2;
    measure_[pos] = time;
    if (pos > 0) {
      auto wave = MaxWave + 1 - pos;

      // Relative change (in percent) between two consecutive baselines. The
      // absolute difference is computed in the samples' own type so that no
      // narrowing cast is involved, whatever the width of `long` is on the
      // target platform.
      const auto previous = measure_[pos - 1];
      const auto current = measure_[pos];
      const auto delta = (previous > current) ? (previous - current) : (current - previous);
      if (delta * 100 / current > DscThresh) {
        // The baseline itself moved too much; measurements are not comparable.
        discontinuous_ = true;
      }

      reference_[wave] = (time + measure_[pos - 1]) / 2;
      ratio_[wave] = trial_[wave] * 100 / reference_[wave];

      // A lower ratio means the trial ran faster relative to its reference.
      if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
        bestWave_ = wave;
      }
    }
  } else {
    assert(waves_ == MaxWave - count / 2);
    trial_[waves_] = time;
  }
  outputTrace();
}
|
|
|
|
// Appends a snapshot of the limiter state and the sample tables to the trace
// stream. Does nothing when no trace file is open.
void WLAlgorithmSmooth::outputTrace() {
  if (traceStream_.is_open()) {
    // Header line with the scalar state.
    traceStream_ << "[WaveLimiter] " << manager_->name();
    traceStream_ << " state=" << state_;
    traceStream_ << " currWaves=" << currWaves_;
    traceStream_ << " waves=" << waves_;
    traceStream_ << " bestWave=" << bestWave_;
    traceStream_ << '\n';

    // One labelled row per table.
    output(traceStream_, "\n measure = ", measure_);
    output(traceStream_, "\n reference = ", reference_);
    output(traceStream_, "\n ratio = ", ratio_);
    traceStream_ << "\n\n";
  }
}
|
|
|
|
|
|
void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
|
|
dumper_.addData(duration, currWaves_, static_cast<char>(state_));
|
|
|
|
if (!enable_ || (duration == 0)) {
|
|
return;
|
|
}
|
|
|
|
countAll_++;
|
|
|
|
switch (state_) {
|
|
case WARMUP:
|
|
if (countAll_ < WarmUpCount) {
|
|
return;
|
|
}
|
|
state_ = ADAPT;
|
|
bestWave_ = MaxWave;
|
|
clearData();
|
|
return;
|
|
case ADAPT:
|
|
assert(duration > 0);
|
|
if (waves_ == currWaves_) {
|
|
dataCount_++;
|
|
updateData(duration);
|
|
waves_ = MaxWave + 1 - dataCount_ / 2;
|
|
if (dataCount_ == 1 || (dataCount_ < AdaptCount && !discontinuous_ &&
|
|
(dataCount_ % 2 == 0 || ratio_[waves_] < AbandonThresh))) {
|
|
if (dataCount_ % 2 == 1) {
|
|
--waves_;
|
|
} else {
|
|
waves_ = MaxWave;
|
|
}
|
|
return;
|
|
}
|
|
waves_ = bestWave_;
|
|
if (dataCount_ >= AdaptCount) {
|
|
dynRunCount_ = RunCount;
|
|
} else {
|
|
dynRunCount_ = AdaptCount;
|
|
}
|
|
countAll_ = rand() % MaxWave;
|
|
state_ = RUN;
|
|
}
|
|
return;
|
|
case RUN:
|
|
if (countAll_ < dynRunCount_) {
|
|
return;
|
|
}
|
|
state_ = ADAPT;
|
|
bestWave_ = MaxWave;
|
|
clearData();
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Prepares a dumper for one kernel.
//   kernelName - name used to build the output file name
//   enable     - when false the dumper stays inert and no file name is built
WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
  enable_ = enable;
  if (!enable_) {
    return;
  }
  // The dump flag value is used as the path prefix of the CSV file.
  fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
}
|
|
|
|
// Writes all recorded samples to the CSV file, one row per sample:
// index, time, waves per SIMD, state. Nothing is written when dumping is
// disabled.
WaveLimiter::DataDumper::~DataDumper() {
  if (enable_) {
    std::ofstream csv(fileName_);
    const size_t rows = time_.size();
    for (size_t row = 0; row != rows; ++row) {
      csv << row << ',' << time_[row] << ',' << wavePerSIMD_[row] << ','
          << static_cast<uint>(state_[row]) << '\n';
    }
    csv.close();
  }
}
|
|
|
|
// Records one sample (time, wave count, state) for later dumping. Ignored when
// dumping is disabled.
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
  if (enable_) {
    time_.push_back(time);
    wavePerSIMD_.push_back(wave);
    state_.push_back(state);
  }
}
|
|
|
|
// Creates the per-kernel manager. Adaptive limiting starts disabled; dumping is
// enabled whenever the GPU_WAVE_LIMIT_DUMP flag was set explicitly.
//   kernel    - kernel that owns this manager
//   simdPerSH - SIMD-per-SH count handed to setIfNotDefault
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
    : owner_(kernel),
      enable_(false),
      enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
  // NOTE(review): setIfNotDefault presumably lets GPU_WAVE_LIMIT_CU_PER_SH
  // override the value passed in - confirm against its definition.
  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);

  // A non-zero GPU_WAVES_PER_SIMD yields a fixed per-SH limit.
  fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
|
|
|
|
// Destroys every limiter stored in limiters_.
WaveLimiterManager::~WaveLimiterManager() {
  for (auto& entry : limiters_) {
    delete entry.second;
  }
}
|
|
|
|
// Returns the waves-per-SH value for the given virtual device: the fixed limit
// when one is set, otherwise the value from the device's limiter when adaptive
// limiting is enabled and a limiter exists, otherwise 0.
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
  // A fixed limit takes precedence over everything else.
  if (fixed_ > 0) {
    return fixed_;
  }

  if (enable_) {
    const auto entry = limiters_.find(vdev);
    if (entry != limiters_.end()) {
      assert(entry->second != NULL);
      return entry->second->getWavesPerSH();
    }
  }

  return 0;
}
|
|
|
|
// Returns the limiter used as profiling callback for the given virtual device,
// creating it on first use. Returns NULL when neither adaptive limiting nor
// dumping is enabled, or when the limiter could not be allocated.
amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
    const device::VirtualDevice* vdev) {
  assert(vdev != NULL);

  const bool wanted = enable_ || enableDump_;
  if (!wanted) {
    return NULL;
  }

  // Lookup and insertion happen under the manager's monitor.
  amd::ScopedLock guard(monitor_);

  const auto existing = limiters_.find(vdev);
  if (existing != limiters_.end()) {
    return existing->second;
  }

  // The current map size serves as the sequence number of the new limiter.
  auto created = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
  if (created == NULL) {
    enable_ = false;
    return NULL;
  }

  limiters_[vdev] = created;
  return created;
}
|
|
|
|
void WaveLimiterManager::enable(const bool isCiPlus) {
|
|
if (fixed_ > 0) {
|
|
return;
|
|
}
|
|
|
|
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
|
|
// Disabled for SI due to bug #10817
|
|
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
|
|
enable_ = GPU_WAVE_LIMIT_ENABLE;
|
|
} else {
|
|
if (isCiPlus) {
|
|
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
|
|
enable_ = true;
|
|
} else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
|
|
fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|