From 6cc75de90f551df25b5752b1e0d75006583df026 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 29 Aug 2018 18:54:19 -0400
Subject: [PATCH] P4 to Git Change 1599699 by gandryey@gera-w8 on 2018/08/29
18:43:02
SWDEV-79445 - OCL generic changes and code clean-up
- Move WaveLimiter logic to the abstract layer. PAL version was taken as the base, thus performance of GSL path can be affected by this change
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#315 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.cpp#1 move/add
... //depot/stg/opencl/drivers/opencl/runtime/device/devwavelimiter.hpp#1 move/add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#598 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#331 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#133 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.cpp#15 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuwavelimiter.hpp#11 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#107 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#8 move/delete
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#8 move/delete
---
rocclr/runtime/device/device.hpp | 1 +
rocclr/runtime/device/devkernel.cpp | 38 ++-
rocclr/runtime/device/devkernel.hpp | 53 +---
.../palwavelimiter.cpp => devwavelimiter.cpp} | 34 ++-
.../palwavelimiter.hpp => devwavelimiter.hpp} | 23 +-
rocclr/runtime/device/gpu/gpudevice.cpp | 1 +
rocclr/runtime/device/gpu/gpukernel.cpp | 8 +-
rocclr/runtime/device/gpu/gpukernel.hpp | 21 +-
rocclr/runtime/device/gpu/gpuwavelimiter.cpp | 276 ------------------
rocclr/runtime/device/gpu/gpuwavelimiter.hpp | 151 ----------
rocclr/runtime/device/pal/paldevice.cpp | 1 +
rocclr/runtime/device/pal/palkernel.cpp | 7 +-
rocclr/runtime/device/pal/palkernel.hpp | 14 +-
13 files changed, 95 insertions(+), 533 deletions(-)
rename rocclr/runtime/device/{pal/palwavelimiter.cpp => devwavelimiter.cpp} (80%)
rename rocclr/runtime/device/{pal/palwavelimiter.hpp => devwavelimiter.hpp} (90%)
delete mode 100644 rocclr/runtime/device/gpu/gpuwavelimiter.cpp
delete mode 100644 rocclr/runtime/device/gpu/gpuwavelimiter.hpp
diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index ec89e63f5b..13b7ac451b 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -425,6 +425,7 @@ struct Info : public amd::EmbeddedObject {
//! that execute in parallel. All work items from the same work group must be
//! executed by SIMDs in the same compute unit.
cl_uint simdPerCU_;
+ cl_uint cuPerShaderArray_; //!< Number of CUs per shader array
//! The maximum number of work items from the same work group that can be
//! executed by a SIMD in parallel
cl_uint simdWidth_;
diff --git a/rocclr/runtime/device/devkernel.cpp b/rocclr/runtime/device/devkernel.cpp
index 9c4b43f960..12ec537d31 100644
--- a/rocclr/runtime/device/devkernel.cpp
+++ b/rocclr/runtime/device/devkernel.cpp
@@ -23,7 +23,43 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
namespace device {
- // ================================================================================================
+// ================================================================================================
+Kernel::Kernel(const amd::Device& dev, const std::string& name)
+ : dev_(dev)
+ , name_(name)
+ , signature_(nullptr)
+ , waveLimiter_(this, dev.info().cuPerShaderArray_ * dev.info().simdPerCU_) {
+ // waveLimiter_ takes SIMDs per shader array: CUs per SH times SIMDs per CU.
+ // workGroupInfo_ holds a std::string, so zero it per field instead of memset().
+ workGroupInfo_.size_ = 0;
+ workGroupInfo_.compileSize_[0] = 0;
+ workGroupInfo_.compileSize_[1] = 0;
+ workGroupInfo_.compileSize_[2] = 0;
+ workGroupInfo_.localMemSize_ = 0;
+ workGroupInfo_.preferredSizeMultiple_ = 0;
+ workGroupInfo_.privateMemSize_ = 0;
+ workGroupInfo_.scratchRegs_ = 0;
+ workGroupInfo_.wavefrontPerSIMD_ = 0;
+ workGroupInfo_.wavefrontSize_ = 0;
+ workGroupInfo_.availableGPRs_ = 0;
+ workGroupInfo_.usedGPRs_ = 0;
+ workGroupInfo_.availableSGPRs_ = 0;
+ workGroupInfo_.usedSGPRs_ = 0;
+ workGroupInfo_.availableVGPRs_ = 0;
+ workGroupInfo_.usedVGPRs_ = 0;
+ workGroupInfo_.availableLDSSize_ = 0;
+ workGroupInfo_.usedLDSSize_ = 0;
+ workGroupInfo_.availableStackSize_ = 0;
+ workGroupInfo_.usedStackSize_ = 0;
+ workGroupInfo_.compileSizeHint_[0] = 0;
+ workGroupInfo_.compileSizeHint_[1] = 0;
+ workGroupInfo_.compileSizeHint_[2] = 0;
+ workGroupInfo_.compileVecTypeHint_ = "";
+ workGroupInfo_.uniformWorkGroupSize_ = false;
+ workGroupInfo_.wavesPerSimdHint_ = 0;
+}
+
+// ================================================================================================
bool Kernel::createSignature(
const parameters_t& params, uint32_t numParameters,
uint32_t version) {
diff --git a/rocclr/runtime/device/devkernel.hpp b/rocclr/runtime/device/devkernel.hpp
index 94200a92b0..59f7733fa9 100644
--- a/rocclr/runtime/device/devkernel.hpp
+++ b/rocclr/runtime/device/devkernel.hpp
@@ -7,6 +7,7 @@
#include "platform/context.hpp"
#include "platform/object.hpp"
#include "platform/memory.hpp"
+#include "devwavelimiter.hpp"
#if defined(WITH_LIGHTNING_COMPILER)
namespace llvm {
@@ -37,10 +38,6 @@ class Device;
class KernelSignature;
class NDRange;
-struct ProfilingCallback : public amd::HeapObject {
- virtual void callback(ulong duration, uint32_t waves) = 0;
-};
-
struct KernelParameterDescriptor {
enum {
Value = 0,
@@ -124,39 +121,7 @@ class Kernel : public amd::HeapObject {
};
//! Default constructor
- Kernel(const amd::Device& dev, const std::string& name)
- : dev_(dev)
- , name_(name)
- , signature_(nullptr) {
- // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_));
- // Due to std::string not being able to be memset to 0
- workGroupInfo_.size_ = 0;
- workGroupInfo_.compileSize_[0] = 0;
- workGroupInfo_.compileSize_[1] = 0;
- workGroupInfo_.compileSize_[2] = 0;
- workGroupInfo_.localMemSize_ = 0;
- workGroupInfo_.preferredSizeMultiple_ = 0;
- workGroupInfo_.privateMemSize_ = 0;
- workGroupInfo_.scratchRegs_ = 0;
- workGroupInfo_.wavefrontPerSIMD_ = 0;
- workGroupInfo_.wavefrontSize_ = 0;
- workGroupInfo_.availableGPRs_ = 0;
- workGroupInfo_.usedGPRs_ = 0;
- workGroupInfo_.availableSGPRs_ = 0;
- workGroupInfo_.usedSGPRs_ = 0;
- workGroupInfo_.availableVGPRs_ = 0;
- workGroupInfo_.usedVGPRs_ = 0;
- workGroupInfo_.availableLDSSize_ = 0;
- workGroupInfo_.usedLDSSize_ = 0;
- workGroupInfo_.availableStackSize_ = 0;
- workGroupInfo_.usedStackSize_ = 0;
- workGroupInfo_.compileSizeHint_[0] = 0;
- workGroupInfo_.compileSizeHint_[1] = 0;
- workGroupInfo_.compileSizeHint_[2] = 0;
- workGroupInfo_.compileVecTypeHint_ = "";
- workGroupInfo_.uniformWorkGroupSize_ = false;
- workGroupInfo_.wavesPerSimdHint_ = 0;
- }
+ Kernel(const amd::Device& dev, const std::string& name);
//! Default destructor
virtual ~Kernel();
@@ -196,13 +161,14 @@ class Kernel : public amd::HeapObject {
size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; }
//! Get profiling callback object
- virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) {
- return nullptr;
- }
+ amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
+ return waveLimiter_.getProfilingCallback(vdev);
+ }
- virtual uint getWavesPerSH(const device::VirtualDevice* vdv) const {
- return 0;
- }
+ //! Get waves per shader array to be used for kernel execution on \a vdev.
+ uint getWavesPerSH(const device::VirtualDevice* vdev) const {
+ return waveLimiter_.getWavesPerSH(vdev);
+ }
//! Returns GPU device object, associated with this kernel
const amd::Device& dev() const { return dev_; }
@@ -272,6 +238,7 @@ class Kernel : public amd::HeapObject {
amd::KernelSignature* signature_; //!< kernel signature
std::string buildLog_; //!< build log
std::vector printf_; //!< Format strings for GPU printf support
+ WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
union Flags {
struct {
diff --git a/rocclr/runtime/device/pal/palwavelimiter.cpp b/rocclr/runtime/device/devwavelimiter.cpp
similarity index 80%
rename from rocclr/runtime/device/pal/palwavelimiter.cpp
rename to rocclr/runtime/device/devwavelimiter.cpp
index 75cb0811cc..67d2380ffb 100644
--- a/rocclr/runtime/device/pal/palwavelimiter.cpp
+++ b/rocclr/runtime/device/devwavelimiter.cpp
@@ -1,20 +1,22 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
-#include "device/pal/palkernel.hpp"
-#include "device/pal/palwavelimiter.hpp"
+#include "platform/command.hpp"
+#include "device/devkernel.hpp"
+#include "device/devwavelimiter.hpp"
#include "os/os.hpp"
#include "utils/flags.hpp"
#include
using namespace std;
-namespace pal {
+namespace device {
uint WaveLimiter::MaxWave;
uint WaveLimiter::RunCount;
uint WaveLimiter::AdaptCount;
+// ================================================================================================
WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
: manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
@@ -36,12 +38,14 @@ WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable,
numContinuousSamples_ = 0;
}
+// ================================================================================================
WaveLimiter::~WaveLimiter() {
if (traceStream_.is_open()) {
traceStream_.close();
}
}
+// ================================================================================================
uint WaveLimiter::getWavesPerSH() {
// Generate different wave counts in the adaptation mode
if ((state_ == ADAPT) && (sampleCount_ < AdaptCount)) {
@@ -66,6 +70,7 @@ uint WaveLimiter::getWavesPerSH() {
return waves_ * SIMDPerSH_;
}
+// ================================================================================================
WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
bool enableDump)
: WaveLimiter(manager, seqNum, enable, enableDump) {
@@ -78,8 +83,10 @@ WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, b
clearData();
}
+// ================================================================================================
WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
+// ================================================================================================
void WLAlgorithmSmooth::clearData() {
waves_ = MaxWave;
countAll_ = 0;
@@ -88,10 +95,11 @@ void WLAlgorithmSmooth::clearData() {
dataCount_ = 0;
}
+// ================================================================================================
void WLAlgorithmSmooth::updateData(ulong time) {
-
}
+// ================================================================================================
void WLAlgorithmSmooth::outputTrace() {
if (!traceStream_.is_open()) {
return;
@@ -114,7 +122,7 @@ void WLAlgorithmSmooth::outputTrace() {
traceStream_ << "\n\n";
}
-
+// ================================================================================================
void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
dumper_.addData(duration, waves, static_cast(state_));
@@ -212,6 +220,7 @@ void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
}
}
+// ================================================================================================
WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
enable_ = enable;
if (enable_) {
@@ -219,6 +228,7 @@ WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable)
}
}
+// ================================================================================================
WaveLimiter::DataDumper::~DataDumper() {
if (!enable_) {
return;
@@ -232,6 +242,7 @@ WaveLimiter::DataDumper::~DataDumper() {
OFS.close();
}
+// ================================================================================================
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
if (!enable_) {
return;
@@ -242,18 +253,24 @@ void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
state_.push_back(state);
}
+// ================================================================================================
WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
: owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
- setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
+ setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, ((simdPerSH == 0) ? 1 : simdPerSH));
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
}
+// ================================================================================================
WaveLimiterManager::~WaveLimiterManager() {
for (auto& I : limiters_) {
delete I.second;
}
}
+// ================================================================================================
+const std::string& WaveLimiterManager::name() const { return owner_->name(); }
+
+// ================================================================================================
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
if (fixed_ > 0) {
return fixed_;
@@ -291,7 +308,8 @@ amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
return limiter;
}
-void WaveLimiterManager::enable() {
+// ================================================================================================
+void WaveLimiterManager::enable(bool isSupported) {
if (fixed_ > 0) {
return;
}
@@ -300,7 +318,7 @@ void WaveLimiterManager::enable() {
// Disabled for SI due to bug #10817
if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
enable_ = GPU_WAVE_LIMIT_ENABLE;
- } else {
+ } else if (isSupported) {
if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
enable_ = true;
} else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
diff --git a/rocclr/runtime/device/pal/palwavelimiter.hpp b/rocclr/runtime/device/devwavelimiter.hpp
similarity index 90%
rename from rocclr/runtime/device/pal/palwavelimiter.hpp
rename to rocclr/runtime/device/devwavelimiter.hpp
index 6caea9eb79..e13aaee8c0 100644
--- a/rocclr/runtime/device/pal/palwavelimiter.hpp
+++ b/rocclr/runtime/device/devwavelimiter.hpp
@@ -3,7 +3,6 @@
//
#pragma once
-#include "platform/command.hpp"
#include "thread/thread.hpp"
#include
#include
@@ -11,11 +10,17 @@
#include
#include
+namespace amd {
+ struct ProfilingCallback : public amd::HeapObject {
+ virtual void callback(ulong duration, uint32_t waves) = 0;
+ };
+}
+
//! \namespace pal PAL Device Implementation
-namespace pal {
+namespace device {
class WaveLimiterManager;
-class HSAILKernel;
+class Kernel;
// Adaptively limit the number of waves per SIMD based on kernel execution time
class WaveLimiter : public amd::ProfilingCallback {
@@ -120,20 +125,20 @@ class WLAlgorithmSmooth : public WaveLimiter {
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
class WaveLimiterManager {
public:
- explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
+ explicit WaveLimiterManager(Kernel* owner, const uint simdPerSH);
virtual ~WaveLimiterManager();
//! Get waves per shader array for a specific virtual device.
- uint getWavesPerSH(const device::VirtualDevice*) const;
+ uint getWavesPerSH(const VirtualDevice*) const;
//! Provide call back function for a specific virtual device.
- amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*);
+ amd::ProfilingCallback* getProfilingCallback(const VirtualDevice*);
//! Enable wave limiter manager by kernel metadata and flags.
- void enable();
+ void enable(bool isSupported = true);
//! Returns the kernel name
- const std::string& name() const { return owner_->name(); }
+ const std::string& name() const;
//! Get SimdPerSH.
uint getSimdPerSH() const { return simdPerSH_; }
@@ -141,7 +146,7 @@ class WaveLimiterManager {
private:
device::Kernel* owner_; // The kernel which owns this object
uint simdPerSH_; // Simd Per SH
- std::unordered_map
+ std::unordered_map
limiters_; // Maps virtual device to wave limiter
bool enable_; // Whether the adaptation is enabled
bool enableDump_; // Whether the data dumper is enabled
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 2d95f4eae7..de937f6e5a 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -565,6 +565,7 @@ void NullDevice::fillDeviceInfo(const CALdeviceattribs& calAttr, const gslMemInf
info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation & 0x07);
info_.simdPerCU_ = hwInfo()->simdPerCU_;
+ info_.cuPerShaderArray_ = calAttr.numberOfCUsperShaderArray;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
info_.wavefrontWidth_ = calAttr.wavefrontSize;
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index fe88dbd499..68d7a777eb 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -810,9 +810,7 @@ bool Kernel::create(const std::string& code, const std::string& metadata, const
Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog,
const InitData* initData)
- : NullKernel(name, gpuDev, prog),
- waveLimiter_(this,
- dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) {
+ : NullKernel(name, gpuDev, prog) {
hwPrivateSize_ = 0;
if (NULL != initData) {
flags_ = initData->flags_;
@@ -3054,9 +3052,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
code_(NULL),
codeSize_(0),
hwMetaData_(NULL),
- extraArgumentsNum_(extraArgsNum),
- waveLimiter_(this, (prog->isNull() ? 1 : dev().getAttribs().numberOfCUsperShaderArray) *
- dev().hwInfo()->simdPerCU_) {
+ extraArgumentsNum_(extraArgsNum) {
flags_.hsa_ = true;
}
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 544cc9e9e7..a60ada3dad 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -15,7 +15,7 @@
#include "device/gpu/gpuvirtual.hpp"
#include "amd_hsa_kernel_code.h"
#include "device/gpu/gpuprintf.hpp"
-#include "device/gpu/gpuwavelimiter.hpp"
+#include "device/devwavelimiter.hpp"
#include "hsa.h"
namespace amd {
@@ -608,11 +608,6 @@ class Kernel : public NullKernel {
VirtualGPU::GslKernelDesc* desc //!< Kernel descriptor
) const;
- //! Get profiling callback object
- virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
- return waveLimiter_.getProfilingCallback(vdev);
- }
-
protected:
//! Initializes the kernel parameters for the abstraction layer
bool initParameters();
@@ -707,8 +702,6 @@ class Kernel : public NullKernel {
uint hwPrivateSize_; //!< initial HW private size
uint hwLocalSize_; //!< initial HW local size
-
- WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
enum HSAIL_ADDRESS_QUALIFIER {
@@ -833,16 +826,6 @@ class HSAILKernel : public device::Kernel {
//! Returns kernel's extra argument count
uint extraArgumentsNum() const { return extraArgumentsNum_; }
- //! Get profiling callback object
- virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
- return waveLimiter_.getProfilingCallback(vdev);
- }
-
- //! Get waves per shader array to be used for kernel execution.
- uint getWavesPerSH(const device::VirtualDevice* vdev) const {
- return waveLimiter_.getWavesPerSH(vdev);
- }
-
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -873,8 +856,6 @@ class HSAILKernel : public device::Kernel {
char* hwMetaData_; //!< SI metadata
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
-
- WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
/*@}*/} // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
deleted file mode 100644
index b110f41633..0000000000
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#include "device/gpu/gpukernel.hpp"
-#include "device/gpu/gpuwavelimiter.hpp"
-#include "os/os.hpp"
-#include "utils/flags.hpp"
-
-#include
-using namespace std;
-
-namespace gpu {
-
-uint WaveLimiter::MaxWave;
-uint WaveLimiter::WarmUpCount;
-uint WaveLimiter::RunCount;
-uint WLAlgorithmSmooth::AdaptCount;
-uint WLAlgorithmSmooth::AbandonThresh;
-uint WLAlgorithmSmooth::DscThresh;
-
-WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump)
- : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) {
- setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH());
- MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
- WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
- RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
-
- state_ = WARMUP;
- if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
- traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt");
- }
-
- waves_ = MaxWave;
- currWaves_ = MaxWave;
- bestWave_ = MaxWave;
- enable_ = enable;
-}
-
-WaveLimiter::~WaveLimiter() {
- if (traceStream_.is_open()) {
- traceStream_.close();
- }
-}
-
-uint WaveLimiter::getWavesPerSH() {
- currWaves_ = waves_;
- return waves_ * SIMDPerSH_;
-}
-
-WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
- bool enableDump)
- : WaveLimiter(manager, seqNum, enable, enableDump) {
- AdaptCount = 2 * MaxWave + 1;
- AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
- DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
-
- dynRunCount_ = RunCount;
- measure_.resize(MaxWave + 1);
- reference_.resize(MaxWave + 1);
- trial_.resize(MaxWave + 1);
- ratio_.resize(MaxWave + 1);
-
- clearData();
-}
-
-WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
-
-void WLAlgorithmSmooth::clearData() {
- waves_ = MaxWave;
- countAll_ = 0;
- clear(measure_);
- clear(reference_);
- clear(trial_);
- clear(ratio_);
- discontinuous_ = false;
- dataCount_ = 0;
-}
-
-void WLAlgorithmSmooth::updateData(ulong time) {
- auto count = dataCount_ - 1;
- assert(count < 2 * MaxWave + 1);
- assert(time > 0);
- assert(currWaves_ == waves_);
- if (count % 2 == 0) {
- assert(waves_ == MaxWave);
- auto pos = count / 2;
- measure_[pos] = time;
- if (pos > 0) {
- auto wave = MaxWave + 1 - pos;
- if (abs(static_cast(measure_[pos - 1]) - static_cast(measure_[pos])) * 100 /
- measure_[pos] >
- DscThresh) {
- discontinuous_ = true;
- }
- reference_[wave] = (time + measure_[pos - 1]) / 2;
- ratio_[wave] = trial_[wave] * 100 / reference_[wave];
- if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
- bestWave_ = wave;
- }
- }
- } else {
- assert(waves_ == MaxWave - count / 2);
- trial_[waves_] = time;
- }
- outputTrace();
-}
-
-void WLAlgorithmSmooth::outputTrace() {
- if (!traceStream_.is_open()) {
- return;
- }
-
- traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_
- << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_
- << '\n';
- output(traceStream_, "\n measure = ", measure_);
- output(traceStream_, "\n reference = ", reference_);
- output(traceStream_, "\n ratio = ", ratio_);
- traceStream_ << "\n\n";
-}
-
-
-void WLAlgorithmSmooth::callback(ulong duration, uint32_t waves) {
- dumper_.addData(duration, currWaves_, static_cast(state_));
-
- if (!enable_ || (duration == 0)) {
- return;
- }
-
- countAll_++;
-
- switch (state_) {
- case WARMUP:
- if (countAll_ < WarmUpCount) {
- return;
- }
- state_ = ADAPT;
- bestWave_ = MaxWave;
- clearData();
- return;
- case ADAPT:
- assert(duration > 0);
- if (waves_ == currWaves_) {
- dataCount_++;
- updateData(duration);
- waves_ = MaxWave + 1 - dataCount_ / 2;
- if (dataCount_ == 1 || (dataCount_ < AdaptCount && !discontinuous_ &&
- (dataCount_ % 2 == 0 || ratio_[waves_] < AbandonThresh))) {
- if (dataCount_ % 2 == 1) {
- --waves_;
- } else {
- waves_ = MaxWave;
- }
- return;
- }
- waves_ = bestWave_;
- if (dataCount_ >= AdaptCount) {
- dynRunCount_ = RunCount;
- } else {
- dynRunCount_ = AdaptCount;
- }
- countAll_ = rand() % MaxWave;
- state_ = RUN;
- }
- return;
- case RUN:
- if (countAll_ < dynRunCount_) {
- return;
- }
- state_ = ADAPT;
- bestWave_ = MaxWave;
- clearData();
- return;
- }
-}
-
-WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) {
- enable_ = enable;
- if (enable_) {
- fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
- }
-}
-
-WaveLimiter::DataDumper::~DataDumper() {
- if (!enable_) {
- return;
- }
-
- std::ofstream OFS(fileName_);
- for (size_t i = 0, e = time_.size(); i != e; ++i) {
- OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast(state_[i])
- << '\n';
- }
- OFS.close();
-}
-
-void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
- if (!enable_) {
- return;
- }
-
- time_.push_back(time);
- wavePerSIMD_.push_back(wave);
- state_.push_back(state);
-}
-
-WaveLimiterManager::WaveLimiterManager(device::Kernel* kernel, const uint simdPerSH)
- : owner_(kernel), enable_(false), enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
- setIfNotDefault(simdPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, simdPerSH);
- fixed_ = GPU_WAVES_PER_SIMD * simdPerSH_;
-}
-
-WaveLimiterManager::~WaveLimiterManager() {
- for (auto& I : limiters_) {
- delete I.second;
- }
-}
-
-uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const {
- if (fixed_ > 0) {
- return fixed_;
- }
- if (!enable_) {
- return 0;
- }
- auto loc = limiters_.find(vdev);
- if (loc == limiters_.end()) {
- return 0;
- }
- assert(loc->second != NULL);
- return loc->second->getWavesPerSH();
-}
-
-amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
- const device::VirtualDevice* vdev) {
- assert(vdev != NULL);
- if (!enable_ && !enableDump_) {
- return NULL;
- }
-
- amd::ScopedLock SL(monitor_);
- auto loc = limiters_.find(vdev);
- if (loc != limiters_.end()) {
- return loc->second;
- }
-
- auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_);
- if (limiter == NULL) {
- enable_ = false;
- return NULL;
- }
- limiters_[vdev] = limiter;
- return limiter;
-}
-
-void WaveLimiterManager::enable(const bool isCiPlus) {
- if (fixed_ > 0) {
- return;
- }
-
- // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
- // Disabled for SI due to bug #10817
- if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) {
- enable_ = GPU_WAVE_LIMIT_ENABLE;
- } else {
- if (isCiPlus) {
- if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) {
- enable_ = true;
- } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) {
- fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH();
- }
- }
- }
-}
-}
diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
deleted file mode 100644
index 570a457d62..0000000000
--- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#ifndef GPUWAVELIMITER_HPP_
-#define GPUWAVELIMITER_HPP_
-
-#include "platform/command.hpp"
-#include "thread/thread.hpp"
-#include
-#include
-#include
-#include
-#include
-
-//! \namespace gpu GPU Device Implementation
-namespace gpu {
-
-class WaveLimiterManager;
-
-// Adaptively limit the number of waves per SIMD based on kernel execution time
-class WaveLimiter : public amd::ProfilingCallback {
- public:
- explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump);
- virtual ~WaveLimiter();
-
- //! Get waves per shader array to be used for kernel execution.
- uint getWavesPerSH();
-
- protected:
- enum StateKind { WARMUP, ADAPT, RUN };
-
- class DataDumper {
- public:
- explicit DataDumper(const std::string& kernelName, bool enable);
- ~DataDumper();
-
- //! Record execution time, waves/simd and state of wave limiter.
- void addData(ulong time, uint wave, char state);
-
- //! Whether this data dumper is enabled.
- bool enabled() const { return enable_; }
-
- private:
- bool enable_;
- std::string fileName_;
- std::vector time_;
- std::vector wavePerSIMD_;
- std::vector state_;
- };
-
- std::vector measure_;
- bool enable_;
- uint SIMDPerSH_; // Number of SIMDs per SH
- uint waves_; // Waves per SIMD to be set
- uint bestWave_; // Optimal waves per SIMD
- uint countAll_; // Number of kernel executions
- StateKind state_;
- WaveLimiterManager* manager_;
- DataDumper dumper_;
- std::ofstream traceStream_;
- uint currWaves_; // Current waves per SIMD
-
- static uint MaxWave; // Maximum number of waves per SIMD
- static uint WarmUpCount; // Number of kernel executions for warm up
- static uint RunCount; // Number of kernel executions for normal run
-
- //! Call back from Event::recordProfilingInfo to get execution time.
- virtual void callback(ulong duration, uint32_t waves) = 0;
-
- //! Output trace of measurement/adaptation.
- virtual void outputTrace() = 0;
-
- template void clear(T& A) {
- for (auto& I : A) {
- I = 0;
- }
- }
- template void output(std::ofstream& ofs, const std::string& prompt, T& A) {
- ofs << prompt;
- for (auto& I : A) {
- ofs << ' ' << static_cast(I);
- }
- }
-};
-
-class WLAlgorithmSmooth : public WaveLimiter {
- public:
- explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable,
- bool enableDump);
- virtual ~WLAlgorithmSmooth();
-
- private:
- std::vector reference_;
- std::vector trial_;
- std::vector ratio_;
- bool discontinuous_; // Measured data is discontinuous
- uint dynRunCount_;
- uint dataCount_;
-
- static uint AdaptCount; // Number of kernel executions for adapting
- static uint AbandonThresh; // Threshold to abandon adaptation
- static uint DscThresh; // Threshold for identifying discontinuities
-
- //! Update measurement data and optimal waves/simd with execution time.
- void updateData(ulong time);
-
- //! Clear measurement data for the next adaptation.
- void clearData();
-
- //! Call back from Event::recordProfilingInfo to get execution time.
- void callback(ulong duration, uint32_t waves);
-
- //! Output trace of measurement/adaptation.
- void outputTrace();
-};
-
-// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
-class WaveLimiterManager {
- public:
- explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH);
- virtual ~WaveLimiterManager();
-
- //! Get waves per shader array for a specific virtual device.
- uint getWavesPerSH(const device::VirtualDevice*) const;
-
- //! Provide call back function for a specific virtual device.
- amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*);
-
- //! Enable wave limiter manager by kernel metadata and flags.
- void enable(const bool isCiPlus);
-
- //! Returns the kernel name
- const std::string& name() const { return owner_->name(); }
-
- //! Get SimdPerSH.
- uint getSimdPerSH() const { return simdPerSH_; }
-
- private:
- device::Kernel* owner_; // The kernel which owns this object
- uint simdPerSH_; // Simd Per SH
- std::unordered_map
- limiters_; // Maps virtual device to wave limiter
- bool enable_; // Whether the adaptation is enabled
- bool enableDump_; // Whether the data dumper is enabled
- uint fixed_; // The fixed waves/simd value if not zero
- amd::Monitor monitor_; // The mutex for updating the wave limiter map
-};
-}
-#endif
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 5d0a0bcd03..aaa4987605 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -574,6 +574,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber;
info_.simdPerCU_ = hwInfo()->simdPerCU_;
+ info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
info_.simdWidth_ = hwInfo()->simdWidth_;
info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize;
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 7c330fbbd4..fbfe429231 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -74,12 +74,7 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
prog_(*prog),
index_(0),
code_(0),
- codeSize_(0),
- waveLimiter_(
- this,
- (prog->isNull() ? 1
- : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) *
- dev().hwInfo()->simdPerCU_) {
+ codeSize_(0) {
flags_.hsa_ = true;
}
diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp
index 43fc6ff185..c7477697c0 100644
--- a/rocclr/runtime/device/pal/palkernel.hpp
+++ b/rocclr/runtime/device/pal/palkernel.hpp
@@ -13,7 +13,7 @@
#include "device/pal/palvirtual.hpp"
#include "amd_hsa_kernel_code.h"
#include "device/pal/palprintf.hpp"
-#include "device/pal/palwavelimiter.hpp"
+#include "device/devwavelimiter.hpp"
#include "hsa.h"
#if defined(WITH_LIGHTNING_COMPILER)
@@ -98,16 +98,6 @@ class HSAILKernel : public device::Kernel {
//! Returns the kernel index in the program
uint index() const { return index_; }
- //! Get profiling callback object
- virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) {
- return waveLimiter_.getProfilingCallback(vdev);
- };
-
- //! Get waves per shader array to be used for kernel execution.
- virtual uint getWavesPerSH(const device::VirtualDevice* vdev) const {
- return waveLimiter_.getWavesPerSH(vdev);
- };
-
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -126,8 +116,6 @@ class HSAILKernel : public device::Kernel {
uint64_t code_; //!< GPU memory pointer to the kernel
size_t codeSize_; //!< Size of ISA code
-
- WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};
#if defined(WITH_LIGHTNING_COMPILER)