P4 to Git Change 1538923 by vsytchen@vsytchen-ocl-win10 on 2018/04/10 12:18:20

SWDEV-150166 - [OCL][LC][Ellesmere][Windows] oclcompiler.OCLStructOperations[30] causing crash

	1. Add runtime flag GPU_MAX_COMMAND_BUFFERS to set the number of command buffers allocated per queue for PAL devices.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/14572/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#49 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#18 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#88 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#47 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#288 edit


[ROCm/clr commit: 19332a7d5e]
This commit is contained in:
foreman
2018-04-10 12:34:33 -04:00
parent a9c0878566
commit dc5e6fb1e2
5 changed files with 36 additions and 26 deletions
@@ -141,6 +141,9 @@ Settings::Settings() {
subAllocationChunkSize_ = 64 * Mi;
subAllocationMaxSize_ =
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
// Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments
maxCmdBuffers_ = 8;
}
bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -495,6 +498,10 @@ void Settings::override() {
break;
}
}
if (!flagIsDefault(GPU_MAX_COMMAND_BUFFERS)) {
maxCmdBuffers_ = GPU_MAX_COMMAND_BUFFERS;
}
}
} // namespace pal
@@ -95,6 +95,7 @@ class Settings : public device::Settings {
size_t numMemDependencies_; //!< The array size for memory dependencies tracking
uint64_t maxAllocSize_; //!< Maximum single allocation size
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
@@ -36,7 +36,7 @@ namespace pal {
VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
uint rtCU, amd::CommandQueue::Priority priority,
uint64_t residency_limit) {
uint64_t residency_limit, uint max_command_buffers) {
Pal::Result result;
Pal::CmdBufferCreateInfo cmdCreateInfo = {};
Pal::QueueCreateInfo qCreateInfo = {};
@@ -81,8 +81,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
return nullptr;
}
size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize);
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit);
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
if (queue != nullptr) {
address addrQ = reinterpret_cast<address>(&queue[1]);
// Create PAL queue object
@@ -93,10 +93,10 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
}
queue->UpdateAppPowerProfile();
address addrCmd = addrQ + qSize;
address addrF = addrCmd + MaxCmdBuffers * cmdSize;
address addrF = addrCmd + max_command_buffers * cmdSize;
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
for (uint i = 0; i < MaxCmdBuffers; ++i) {
for (uint i = 0; i < max_command_buffers; ++i) {
result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
if (result != Pal::Result::Success) {
delete queue;
@@ -133,7 +133,7 @@ VirtualGPU::Queue::~Queue() {
}
memReferences_.clear();
for (uint i = 0; i < MaxCmdBuffers; ++i) {
for (uint i = 0; i < max_command_buffers_; ++i) {
if (nullptr != iCmdBuffs_[i]) {
iCmdBuffs_[i]->Destroy();
}
@@ -274,14 +274,14 @@ bool VirtualGPU::Queue::flush() {
}
// Wrap current slot
cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;
waifForFence<IbReuse>(cmdBufIdSlot_);
// Progress retired TS
if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
(cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) {
cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers;
if ((cmdBufIdCurrent_ > max_command_buffers_) &&
(cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
}
// Reset command buffer, so CB chunks could be reused
@@ -326,7 +326,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
return true;
}
uint slotId = id % MaxCmdBuffers;
uint slotId = id % max_command_buffers_;
constexpr bool IbReuse = true;
bool result = waifForFence<!IbReuse>(slotId);
cmbBufIdRetired_ = id;
@@ -343,7 +343,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
flush();
}
if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) {
if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) {
return false;
}
cmbBufIdRetired_ = id;
@@ -784,6 +784,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
uint idx = index() % (dev().numComputeEngines() - firstQueue);
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
(dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
if (dev().numComputeEngines()) {
//! @todo There is a hang with a mix of user and non user queues.
@@ -795,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
cmdAllocator_, rtCUs, priority,
residency_limit);
residency_limit, max_cmd_buffers);
if (nullptr == queues_[MainEngine]) {
return false;
}
@@ -813,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
queues_[SdmaEngine] =
Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
residency_limit);
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
} else {
queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
residency_limit);
residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
@@ -41,8 +41,6 @@ class VirtualGPU : public device::VirtualDevice {
public:
class Queue : public amd::HeapObject {
public:
// Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments
static const uint MaxCmdBuffers = 8;
static const uint MaxCommands = 256;
static const uint StartCmdBufIdx = 1;
static const uint FirstMemoryReference = 0x80000000;
@@ -58,11 +56,14 @@ class VirtualGPU : public device::VirtualDevice {
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
uint rtCU, //!< The number of reserved CUs
amd::CommandQueue::Priority priority, //!< Queue priority
uint64_t residency_limit //!< Enables residency limit
uint64_t residency_limit, //!< Enables residency limit
uint max_command_buffers //!< Number of allocated command buffers
);
Queue(Pal::IDevice* palDev, uint64_t residency_limit)
Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
: iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
iCmdFences_(max_command_buffers, nullptr),
last_kernel_(nullptr),
iDev_(palDev),
cmdBufIdSlot_(StartCmdBufIdx),
@@ -71,12 +72,9 @@ class VirtualGPU : public device::VirtualDevice {
cmdCnt_(0),
vlAlloc_(64 * Ki),
residency_size_(0),
residency_limit_(residency_limit)
residency_limit_(residency_limit),
max_command_buffers_(max_command_buffers)
{
for (uint i = 0; i < MaxCmdBuffers; ++i) {
iCmdBuffs_[i] = nullptr;
iCmdFences_[i] = nullptr;
}
vlAlloc_.Init();
}
@@ -152,8 +150,8 @@ class VirtualGPU : public device::VirtualDevice {
uint cmdBufId() const { return cmdBufIdCurrent_; }
Pal::IQueue* iQueue_; //!< PAL queue object
Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers
Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
const amd::Kernel* last_kernel_; //!< Last submitted kernel
private:
@@ -172,6 +170,7 @@ class VirtualGPU : public device::VirtualDevice {
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
uint64_t residency_size_; //!< Resource residency size
uint64_t residency_limit_; //!< Enables residency limit
uint max_command_buffers_;
};
struct CommandBatch : public amd::HeapObject {
@@ -217,6 +217,8 @@ release_on_stg(uint, PAL_RGP_DISP_COUNT, 10, \
"The number of dispatches for RGP capture with SQTT") \
release(bool, GPU_FORCE_WAVE_SIZE_32, false, \
"Forces WaveSize32 compilation in SC") \
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
"The maximum number of command buffers allocated per queue") \
namespace amd {