SWDEV-407533 - [ABI Break]Remove Wavelimiter

Change-Id: I6a2f6fb5a0c3acea93fa0200a69679783e76f5bd


[ROCm/clr commit: 7be3a5e33e]
Этот коммит содержится в:
German
2023-08-24 18:29:40 -04:00
коммит произвёл German Andryeyev
родитель 97d7b15428
Коммит 5d9912f48b
14 изменённых файлов: 10 добавлений и 584 удалений
-1
Просмотреть файл
@@ -82,7 +82,6 @@ class EventMarker : public amd::Marker {
int32_t scope = amd::Device::kCacheStateInvalid)
: amd::Marker(stream, disableFlush) {
profilingInfo_.enabled_ = true;
profilingInfo_.callback_ = nullptr;
profilingInfo_.marker_ts_ = markerTs;
profilingInfo_.clear();
setEventScope(scope);
-1
Просмотреть файл
@@ -70,7 +70,6 @@ target_sources(rocclr PRIVATE
${ROCCLR_SRC_DIR}/device/device.cpp
${ROCCLR_SRC_DIR}/device/devkernel.cpp
${ROCCLR_SRC_DIR}/device/devprogram.cpp
${ROCCLR_SRC_DIR}/device/devwavelimiter.cpp
${ROCCLR_SRC_DIR}/device/hsailctx.cpp
${ROCCLR_SRC_DIR}/elf/elf.cpp
${ROCCLR_SRC_DIR}/os/alloc.cpp
+1 -2
Просмотреть файл
@@ -589,8 +589,7 @@ Kernel::Kernel(const amd::Device& dev, const std::string& name, const Program& p
: dev_(dev)
, name_(name)
, prog_(prog)
, signature_(nullptr)
, waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
, signature_(nullptr) {
// Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
// Due to std::string not being able to be memset to 0
workGroupInfo_.size_ = 0;
-12
Просмотреть файл
@@ -26,7 +26,6 @@
#include "platform/context.hpp"
#include "platform/object.hpp"
#include "platform/memory.hpp"
#include "devwavelimiter.hpp"
namespace amd {
class Device;
@@ -435,16 +434,6 @@ class Kernel : public amd::HeapObject {
size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
//! Get profiling callback object
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
return waveLimiter_.getProfilingCallback(vdev);
};
//! Get waves per shader array to be used for kernel execution.
uint getWavesPerSH(const device::VirtualDevice* vdev) const {
return waveLimiter_.getWavesPerSH(vdev);
};
//! Returns GPU device object, associated with this kernel
const amd::Device& device() const { return dev_; }
@@ -567,7 +556,6 @@ class Kernel : public amd::HeapObject {
amd::KernelSignature* signature_; //!< kernel signature
std::string buildLog_; //!< build log
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
std::string runtimeHandle_; //!< Runtime handle for context loader
uint64_t kernelCodeHandle_ = 0; //!< Kernel code handle (aka amd_kernel_code_t)
-1
Просмотреть файл
@@ -26,7 +26,6 @@
#include "platform/context.hpp"
#include "platform/object.hpp"
#include "platform/memory.hpp"
#include "devwavelimiter.hpp"
#if defined(USE_COMGR_LIBRARY)
#include "amd_comgr/amd_comgr.h"
-348
Просмотреть файл
@@ -1,348 +0,0 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "platform/command.hpp"
#include "device/devkernel.hpp"
#include "device/devwavelimiter.hpp"
#include "os/os.hpp"
#include "utils/flags.hpp"
#include <cstdlib>
using namespace std;
namespace device {
// Storage for the class-wide tuning values. They start at zero and are
// (re)assigned from the GPU_WAVE_LIMIT_* flags by the WaveLimiter constructor.
uint WaveLimiter::MaxWave = 0;
uint WaveLimiter::RunCount = 0;
uint WaveLimiter::AdaptCount = 0;
// ================================================================================================
//! Builds a limiter owned by \p manager.
//! \param manager     Owning manager; supplies the kernel name and SIMD-per-SH count.
//! \param seqNum      Sequence number appended to the kernel name for the dump file.
//! \param enable      Requested adaptation state (forced off when SIMDPerSH_ is zero).
//! \param enableDump  Whether the data dumper records samples.
WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
    : worstWave_(0),
      state_(WARMUP),
      manager_(manager),
      dumper_(manager->name() + "_" + std::to_string(seqNum), enableDump),
      sampleCount_(0),
      resultCount_(0),
      numContinuousSamples_(0) {
  // The static tuning values are refreshed from the flags on every construction.
  MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
  RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
  AdaptCount = MaxContinuousSamples * 2 * (MaxWave + 1);

  // A non-default GPU_WAVE_LIMIT_CU_PER_SH flag overrides the manager's value.
  setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());

  // Without any SIMDs per SH the limiter cannot operate, whatever the caller asked for.
  enable_ = (SIMDPerSH_ != 0) && enable;
  waves_ = MaxWave;
  bestWave_ = enable_ ? MaxWave : 0;

  if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
    traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
  }
}
// ================================================================================================
//! Closes the trace file if one was opened by the constructor.
WaveLimiter::~WaveLimiter() {
  if (!traceStream_.is_open()) {
    return;
  }
  traceStream_.close();
}
// ================================================================================================
//! Picks the wave count for the next dispatch and returns it scaled by SIMDPerSH_.
//! Outside of the adaptation window the best known wave count is used.
uint WaveLimiter::getWavesPerSH() {
  const bool adapting = (state_ == ADAPT) && (sampleCount_ < AdaptCount);
  if (!adapting) {
    waves_ = bestWave_;
    return waves_ * SIMDPerSH_;
  }

  // A new wave count is chosen only at the start of each group of
  // MaxContinuousSamples consecutive samples.
  if (numContinuousSamples_ == 0) {
    const uint modulus = MaxWave + 1;
    waves_ = (waves_ + 1) % modulus;
    // Wave count 0 is always allowed; otherwise step past every count that is
    // not above the recorded worst one.
    if (waves_ != 0) {
      while (waves_ <= worstWave_) {
        waves_ = (waves_ + 1) % modulus;
      }
    }
  }

  numContinuousSamples_ = (numContinuousSamples_ + 1) % MaxContinuousSamples;
  ++sampleCount_;
  return waves_ * SIMDPerSH_;
}
// ================================================================================================
//! Constructs the base limiter, then sizes the per-wave-count tables
//! (one slot for every wave count in 0..MaxWave) and resets the adaptation data.
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
                                     bool enableDump)
    : WaveLimiter(manager, seqNum, enable, enableDump) {
  const uint slotCount = MaxWave + 1;
  adpMeasure_.resize(slotCount);
  adpSampleCnt_.resize(slotCount);
  runMeasure_.resize(slotCount);
  runSampleCnt_.resize(slotCount);
  dynRunCount_ = RunCount;
  clearData();
}
// ================================================================================================
//! Nothing to release beyond what the members and the base class clean up.
WLAlgorithmSmooth::~WLAlgorithmSmooth() = default;
// ================================================================================================
//! Resets the counters and the adaptation tables ahead of a new adaptation pass.
//! Table entries at indices up to and including worstWave_ are left untouched by clear().
void WLAlgorithmSmooth::clearData() {
  countAll_ = 0;
  dataCount_ = 0;
  waves_ = MaxWave;
  clear(adpMeasure_);
  clear(adpSampleCnt_);
}
// ================================================================================================
//! Intentionally empty: the measurement bookkeeping is done in callback().
void WLAlgorithmSmooth::updateData(ulong /*time*/) {}
// ================================================================================================
//! Writes a snapshot of the limiter state and its measurement tables to the
//! trace stream. Does nothing when no trace file is open.
//!
//! This routine is read-only with respect to the measurement tables. The
//! previous version overwrote zero entries of runSampleCnt_ with 1 to avoid a
//! division by zero, which meant enabling tracing altered the limiter's state.
//! A local divisor is used instead; the text emitted for a given state is the same.
void WLAlgorithmSmooth::outputTrace() {
  if (!traceStream_.is_open()) {
    return;
  }

  traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
               << " waves=" << waves_ << " bestWave=" << bestWave_
               << " worstWave=" << worstWave_ << '\n';
  output(traceStream_, "\n adaptive measure = ", adpMeasure_);
  output(traceStream_, "\n adaptive sample count = ", adpSampleCnt_);
  output(traceStream_, "\n run measure = ", runMeasure_);
  output(traceStream_, "\n run sample count = ", runSampleCnt_);

  traceStream_ << "\n % time from the previous runs to the best wave: ";
  // Average adaptive-mode time of the best wave count; every run average is
  // reported as a percentage of it.
  // NOTE(review): if adpSampleCnt_[bestWave_] is zero this is a float division
  // by zero (inf/NaN in the trace) - unchanged from the original behaviour.
  const float min = static_cast<float>(adpMeasure_[bestWave_]) / adpSampleCnt_[bestWave_];
  for (uint i = 0; i < (MaxWave + 1); ++i) {
    // Treat "no samples" as one sample for display only; do not touch the table.
    const uint32_t samples = (runSampleCnt_[i] == 0) ? 1 : runSampleCnt_[i];
    const float average = static_cast<float>(runMeasure_[i]) / samples;
    traceStream_ << (average * 100 / min) << " ";
  }
  traceStream_ << "\n run count = " << dynRunCount_;
  traceStream_ << "\n\n";
}
// ================================================================================================
//! Profiling callback: records one kernel execution and advances the
//! WARMUP -> ADAPT -> RUN state machine.
//! \param duration  Measured execution time; a value of 0 is ignored.
//! \param waves     Wave value used for the dispatch; it is divided by SIMDPerSH_
//!                  below to obtain the table index.
void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
// The raw sample is always handed to the dumper (which ignores it when disabled).
dumper_.addData(duration, waves, static_cast<char>(state_));
if (!enable_ || (duration == 0)) {
return;
}
countAll_++;
// NOTE(review): the tables hold MaxWave + 1 entries; nothing here checks that the
// resulting index is within that range - confirm callers never exceed it.
waves /= SIMDPerSH_;
// Collect the time for the current wave count
runMeasure_[waves] += duration;
runSampleCnt_[waves]++;
switch (state_) {
case ADAPT:
assert(duration > 0);
// Wave count 0 indicates the start of adaptation
if ((waves == 0) || (resultCount_ > 0)) {
// Accumulate the measured duration for this wave count
adpMeasure_[waves] += duration;
adpSampleCnt_[waves]++;
resultCount_++;
// If the end of adaptation is reached, then analyze the results
if (resultCount_ == AdaptCount) {
// Reset the counters
resultCount_ = sampleCount_ = 0;
float min = std::numeric_limits<float>::max();
// NOTE(review): numeric_limits<float>::min() is the smallest positive normalized
// value, not the most negative one; it works here as a near-zero starting maximum.
float max = std::numeric_limits<float>::min();
uint32_t best = bestWave_;
// Check performance for the previous run if it's available
if (runSampleCnt_[bestWave_] > 0) {
min = static_cast<float>(runMeasure_[bestWave_]) / runSampleCnt_[bestWave_];
}
else if (adpSampleCnt_[MaxWave] > 0) {
min = static_cast<float>(adpMeasure_[MaxWave]) / adpSampleCnt_[MaxWave];
bestWave_ = MaxWave;
}
// Find the fastest average time
float reference = min;
// Wave counts are scanned from MaxWave down to 1; index 0 is not considered.
for (uint i = MaxWave; i > 0; --i) {
float average;
if (adpSampleCnt_[i] > 0) {
average = static_cast<float>(adpMeasure_[i]) / adpSampleCnt_[i];
}
else {
// NOTE(review): an entry without samples gets an average of 0, which passes the
// "faster than min" test below whenever min is positive - confirm this is intended.
average = 0.0f;
}
// More waves have 5% advantage over the lower number
if (average * 1.05f < min) {
min = average;
bestWave_ = i;
}
if (average > max) {
max = average;
worstWave_ = i;
}
}
// Check for 5% acceptance
if ((min * 1.05f > reference) || (bestWave_ == best)) {
bestWave_ = best;
// Increase the run time if the same wave count is the best
dynRunCount_ += RunCount;
dynRunCount_++;
}
else {
dynRunCount_ = RunCount;
}
// Find the middle between the best and the worst
if (worstWave_ < bestWave_) {
worstWave_ += ((bestWave_ - worstWave_) >> 1);
} else {
worstWave_ = 0;
}
state_ = RUN;
outputTrace();
// Start to collect the new data for the best wave
countAll_ = 0;
runMeasure_[bestWave_] = 0;
runSampleCnt_[bestWave_] = 0;
}
}
return;
case WARMUP:
case RUN:
// Stay in the current state until dynRunCount_ executions have been counted.
if (countAll_ < dynRunCount_) {
return;
}
// Discard the run-sample count gathered for the best wave during warm-up.
if (state_ == WARMUP) {
runSampleCnt_[bestWave_] = 0;
}
state_ = ADAPT;
clearData();
return;
}
}
// ================================================================================================
//! Remembers whether dumping is enabled and, if so, derives the CSV file name
//! from the GPU_WAVE_LIMIT_DUMP prefix and the kernel name.
WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable)
    : enable_(enable) {
  if (!enable_) {
    return;
  }
  fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
}
// ================================================================================================
//! When dumping is enabled, writes every recorded sample to the CSV file as
//! "index,time,wave,state", one sample per line.
WaveLimiter::DataDumper::~DataDumper() {
  if (enable_) {
    std::ofstream csv(fileName_);
    const size_t rowCount = time_.size();
    for (size_t row = 0; row < rowCount; ++row) {
      csv << row << ',' << time_[row] << ',' << wavePerSIMD_[row] << ','
          << static_cast<uint>(state_[row]) << '\n';
    }
    csv.close();
  }
}
// ================================================================================================
//! Appends one (time, wave, state) sample; a disabled dumper records nothing.
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
  if (enable_) {
    time_.push_back(time);
    wavePerSIMD_.push_back(wave);
    state_.push_back(state);
  }
}
// ================================================================================================
//! \param kernel     Kernel that owns this manager.
//! \param simdPerSH  SIMDs per shader array; 0 is treated as 1 unless the
//!                   GPU_WAVE_LIMIT_CU_PER_SH flag overrides the value.
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
    : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
  const uint fallbackSimdPerSH = (simdPerSH != 0) ? simdPerSH : 1;
  setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, fallbackSimdPerSH);
  // A non-zero GPU_WAVES_PER_SIMD flag yields a fixed wave count per SH.
  fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
// ================================================================================================
//! Destroys every limiter created by getProfilingCallback().
WaveLimiterManager::~WaveLimiterManager() {
  for (auto it = limiters_.begin(); it != limiters_.end(); ++it) {
    delete it->second;
  }
}
// ================================================================================================
//! Returns the name of the owning kernel.
const std::string& WaveLimiterManager::name() const {
  const std::string& kernelName = owner_->name();
  return kernelName;
}
// ================================================================================================
//! Returns the waves-per-SH value for \p vdev: the fixed value when one is set,
//! otherwise the value of that virtual device's limiter, or 0 when adaptation
//! is disabled or no limiter exists for the device yet.
// NOTE(review): limiters_ is read here without taking monitor_, while
// getProfilingCallback() inserts under it - confirm callers serialize these.
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
  if (fixed_ > 0) {
    return fixed_;
  }
  if (enable_) {
    const auto entry = limiters_.find(vdev);
    if (entry != limiters_.end()) {
      assert(entry->second != nullptr);
      return entry->second->getWavesPerSH();
    }
  }
  return 0;
}
//! Returns the limiter for \p vdev, creating it on first use.
//! Returns nullptr when neither adaptation nor dumping is enabled.
amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
    const device::VirtualDevice* vdev) {
  assert(vdev != nullptr);
  if (!(enable_ || enableDump_)) {
    return nullptr;
  }

  // The map is looked up and updated while holding the monitor.
  amd::ScopedLock SL(monitor_);
  const auto existing = limiters_.find(vdev);
  if (existing != limiters_.end()) {
    return existing->second;
  }

  auto created = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
  if (created == nullptr) {
    // Allocation failed: switch adaptation off for this kernel.
    enable_ = false;
    return nullptr;
  }
  limiters_[vdev] = created;
  return created;
}
// ================================================================================================
//! Decides whether adaptation is enabled, from the flags and the kernel's
//! waves-per-SIMD hint.
//! \param isSupported  Whether the wave limiter may be enabled when the
//!                     GPU_WAVE_LIMIT_ENABLE flag is left at its default.
void WaveLimiterManager::enable(bool isSupported) {
  // A fixed wave count, or an explicitly set GPU_WAVE_LIMIT_ENABLE flag, makes
  // the flag value decide.
  // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
  // Disabled for SI due to bug #10817
  if ((fixed_ > 0) || !flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
    enable_ = GPU_WAVE_LIMIT_ENABLE;
    return;
  }
  if (!isSupported) {
    return;
  }

  const auto hint = owner_->workGroupInfo()->wavesPerSimdHint_;
  if (hint == 0) {
    // No hint from the kernel: adapt dynamically.
    enable_ = true;
  } else if (hint <= GPU_WAVE_LIMIT_MAX_WAVE) {
    // A usable hint becomes the fixed waves-per-SH value.
    fixed_ = hint * getSimdPerSH();
  }
}
} // namespace device
-173
Просмотреть файл
@@ -1,173 +0,0 @@
/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#pragma once
#include "thread/thread.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <fstream>
#include <unordered_map>
namespace amd {
//! Interface for objects that want to be notified with a measured duration
//! and the wave count that was used.
struct ProfilingCallback : public amd::HeapObject {
//! Receives one measurement.
//! \param duration  Measured duration of the execution.
//! \param waves     Wave count associated with the measurement.
virtual void callback(ulong duration, uint32_t waves) = 0;
};
}
//! \namespace device Device Implementation
namespace device {
class WaveLimiterManager;
class Kernel;
// Adaptively limit the number of waves per SIMD based on kernel execution time
//! Base class of the wave limiter: holds the state shared by the adaptation
//! algorithms and leaves callback()/outputTrace() to the derived class.
class WaveLimiter : public amd::ProfilingCallback {
public:
//! \param manager     Manager that owns this limiter.
//! \param seqNum      Sequence number of this limiter within the manager.
//! \param enable      Whether adaptation is requested.
//! \param enableDump  Whether samples are recorded by the data dumper.
explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
virtual ~WaveLimiter();
//! Get waves per shader array to be used for kernel execution.
uint getWavesPerSH();
protected:
//! States of the limiter.
enum StateKind { WARMUP, ADAPT, RUN };
//! Collects (time, wave, state) samples in three parallel vectors.
class DataDumper {
public:
//! \param kernelName  Name used to build the dump file name.
//! \param enable      Whether samples are recorded at all.
explicit DataDumper(const std::string& kernelName, bool enable);
~DataDumper();
//! Record execution time, waves/simd and state of wave limiter.
void addData(ulong time, uint wave, char state);
//! Whether this data dumper is enabled.
bool enabled() const { return enable_; }
private:
bool enable_; // Whether samples are recorded
std::string fileName_; // Name of the dump file
std::vector<ulong> time_; // Recorded execution times
std::vector<uint> wavePerSIMD_; // Recorded wave values
std::vector<char> state_; // Recorded limiter states
};
bool enable_; // Whether the adaptation is enabled
uint SIMDPerSH_; // Number of SIMDs per SH
uint waves_; // Waves per SIMD to be set
uint bestWave_; // Optimal waves per SIMD
uint worstWave_; // Wave number with the worst performance
uint countAll_; // Number of kernel executions
StateKind state_; // Current state of the limiter
WaveLimiterManager* manager_; // Manager passed to the constructor
DataDumper dumper_; // Sample recorder
std::ofstream traceStream_; // Trace output stream
uint32_t sampleCount_; //!< The number of samples for adaptive mode
uint32_t resultCount_; //!< The number of results for adaptive mode
uint32_t numContinuousSamples_; //!< The number of samples with the same wave count
static uint MaxWave; // Maximum number of waves per SIMD
static uint RunCount; // Number of kernel executions for normal run
static uint AdaptCount; // Number of kernel executions for adapting
static constexpr uint MaxContinuousSamples = 2;
//! Call back from Event::recordProfilingInfo to get execution time.
virtual void callback(ulong duration, uint32_t waves) = 0;
//! Output trace of measurement/adaptation.
virtual void outputTrace() = 0;
//! Zeroes every element of \p A whose index is greater than worstWave_;
//! elements at or below that index keep their values.
template <class T> void clear(T& A) {
uint idx = 0;
for (auto& I : A) {
if (idx > worstWave_) {
I = 0;
}
++idx;
}
}
//! Writes \p prompt to \p ofs, followed by each element of \p A converted to
//! ulong and preceded by a space.
template <class T> void output(std::ofstream& ofs, const std::string& prompt, T& A) {
ofs << prompt;
for (auto& I : A) {
ofs << ' ' << static_cast<ulong>(I);
}
}
};
//! Concrete wave limiter that keeps separate accumulated measurements for the
//! adaptation mode and the run mode.
class WLAlgorithmSmooth : public WaveLimiter {
public:
//! Parameters are forwarded to the WaveLimiter constructor.
explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
bool enableDump);
virtual ~WLAlgorithmSmooth();
private:
std::vector<uint64_t> adpMeasure_; //!< Accumulated performance in the adaptation mode
std::vector<uint32_t> adpSampleCnt_; //!< The number of samples in the adaptation mode
std::vector<uint64_t> runMeasure_; //!< Accumulated performance in the run mode
std::vector<uint32_t> runSampleCnt_; //!< The number of samples in the run mode
uint dynRunCount_; //!< Number of executions to count before the next adaptation
uint dataCount_; //!< Counter reset by clearData()
//! Update measurement data and optimal waves/simd with execution time.
void updateData(ulong time);
//! Clear measurement data for the next adaptation.
void clearData();
//! Call back from Event::recordProfilingInfo to get execution time.
void callback(ulong duration, uint32_t waves) override;
//! Output trace of measurement/adaptation.
void outputTrace() override;
};
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
//! Holds one WaveLimiter per virtual device for a single kernel.
class WaveLimiterManager {
public:
//! \param owner      Kernel that owns this manager.
//! \param simdPerSH  Number of SIMDs per SH reported for the device.
explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH);
virtual ~WaveLimiterManager();
//! Get waves per shader array for a specific virtual device.
uint getWavesPerSH(const VirtualDevice*) const;
//! Provide call back function for a specific virtual device.
amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*);
//! Enable wave limiter manager by kernel metadata and flags.
void enable(bool isSupported = true);
//! Returns the kernel name
const std::string& name() const;
//! Get SimdPerSH.
uint getSimdPerSH() const { return simdPerSH_; }
private:
device::Kernel* owner_; // The kernel which owns this object
uint simdPerSH_; // Simd Per SH
// The mapped WaveLimiter pointers are raw pointers held by this manager.
std::unordered_map<const VirtualDevice*, WaveLimiter*>
limiters_; // Maps virtual device to wave limiter
bool enable_; // Whether the adaptation is enabled
bool enableDump_; // Whether the data dumper is enabled
uint fixed_; // The fixed waves/simd value if not zero
amd::Monitor monitor_; // The mutex for updating the wave limiter map
};
}
-2
Просмотреть файл
@@ -224,8 +224,6 @@ bool HSAILKernel::init() {
return false;
}
waveLimiter_.enable();
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
error = amd::Hsail::QueryInfo(palNullDevice().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
-1
Просмотреть файл
@@ -30,7 +30,6 @@
#include "device/pal/palvirtual.hpp"
#include "amd_hsa_kernel_code.h"
#include "device/pal/palprintf.hpp"
#include "device/devwavelimiter.hpp"
#include "hsa.h"
namespace amd {
+6 -7
Просмотреть файл
@@ -2473,8 +2473,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
// Submit kernel to HW
if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
&vcmd.event(), vcmd.sharedMemBytes(),
vcmd.cooperativeGroups())) {
vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2489,7 +2488,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
profilingBegin(vcmd);
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
@@ -2499,9 +2498,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
}
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
const_address parameters, bool nativeMem,
amd::Event* enqueueEvent, uint32_t sharedMemBytes,
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters,
bool nativeMem, uint32_t sharedMemBytes,
bool cooperativeGroup) {
size_t newOffset[3] = {0, 0, 0};
size_t newGlobalSize[3] = {0, 0, 0};
@@ -2648,7 +2647,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
dispatchParam.wavesPerSh = 0;
dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
#ifdef PAL_DEBUGGER
-1
Просмотреть файл
@@ -311,7 +311,6 @@ class VirtualGPU : public device::VirtualDevice {
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
uint32_t sharedMemBytes = 0, //!< Shared memory size
bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
);
-7
Просмотреть файл
@@ -101,10 +101,6 @@ uint64_t Event::recordProfilingInfo(int32_t status, uint64_t timeStamp) {
break;
default:
profilingInfo_.end_ = timeStamp;
if (profilingInfo_.callback_ != nullptr) {
profilingInfo_.callback_->callback(timeStamp - profilingInfo_.start_,
profilingInfo_.waves_);
}
break;
}
return timeStamp;
@@ -429,15 +425,12 @@ NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList
firstDevice_(firstDevice) {
auto& device = queue.device();
auto devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(device));
profilingInfo_.setCallback(devKernel->getProfilingCallback(
queue.vdev()), devKernel->getWavesPerSH(queue.vdev()));
if (cooperativeGroups()) {
setNumWorkgroups();
}
if (forceProfiling) {
profilingInfo_.enabled_ = true;
profilingInfo_.clear();
profilingInfo_.callback_ = nullptr;
profilingInfo_.marker_ts_ = true;
}
kernel_.retain();
+3 -16
Просмотреть файл
@@ -104,10 +104,9 @@ class Event : public RuntimeObject {
static const EventWaitList nullWaitList;
struct ProfilingInfo {
ProfilingInfo(bool enabled = false) : enabled_(enabled), waves_(0), marker_ts_(false) {
ProfilingInfo(bool enabled = false) : enabled_(enabled), marker_ts_(false) {
if (enabled) {
clear();
callback_ = nullptr;
correlation_id_ = activity_prof::correlation_id;
}
}
@@ -116,11 +115,9 @@ class Event : public RuntimeObject {
uint64_t submitted_;
uint64_t start_;
uint64_t end_;
bool enabled_; //!< Profiling enabled for the wave limiter
uint32_t waves_; //!< The number of waves used in a dispatch
ProfilingCallback* callback_;
uint64_t correlation_id_;
bool marker_ts_; //!< TS marker
bool enabled_; //!< Profiling enabled for the wave limiter
bool marker_ts_; //!< TS marker
void clear() {
queued_ = 0ULL;
@@ -128,15 +125,6 @@ class Event : public RuntimeObject {
start_ = 0ULL;
end_ = 0ULL;
}
void setCallback(ProfilingCallback* callback, uint32_t waves) {
if (callback == NULL) {
return;
}
enabled_ = true;
waves_ = waves;
clear();
callback_ = callback;
}
} profilingInfo_;
//! Construct a new event.
@@ -163,7 +151,6 @@ class Event : public RuntimeObject {
void EnableProfiling() {
profilingInfo_.enabled_ = true;
profilingInfo_.clear();
profilingInfo_.callback_ = nullptr;
profilingInfo_.correlation_id_ = activity_prof::correlation_id;
}
-12
Просмотреть файл
@@ -118,8 +118,6 @@ release(uint, OCL_SET_SVM_SIZE, 4*16384, \
"set SVM space size for discrete GPU") \
release(uint, GPU_WAVES_PER_SIMD, 0, \
"Force the number of waves per SIMD (1-10)") \
release(bool, GPU_WAVE_LIMIT_ENABLE, false, \
"1 = Enable adaptive wave limiter") \
release(bool, OCL_STUB_PROGRAMS, false, \
"1 = Enables OCL programs stubing") \
release(bool, GPU_ANALYZE_HANG, false, \
@@ -128,16 +126,6 @@ release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2, \
"Maximum size (in Ki) that allows device memory substitution with system") \
release(bool, GPU_ADD_HBCC_SIZE, false, \
"Add HBCC size to the reported device memory") \
release_on_stg(uint, GPU_WAVE_LIMIT_CU_PER_SH, 0, \
"Assume the number of CU per SH for wave limiter") \
release_on_stg(uint, GPU_WAVE_LIMIT_MAX_WAVE, 10, \
"Set maximum waves per SIMD to try for wave limiter") \
release_on_stg(uint, GPU_WAVE_LIMIT_RUN, 20, \
"Set running factor for wave limiter") \
release_on_stg(cstring, GPU_WAVE_LIMIT_DUMP, "", \
"File path prefix for dumping wave limiter output") \
release_on_stg(cstring, GPU_WAVE_LIMIT_TRACE, "", \
"File path prefix for tracing wave limiter") \
release(bool, PAL_DISABLE_SDMA, false, \
"1 = Disable SDMA for PAL") \
release(uint, PAL_RGP_DISP_COUNT, 10000, \