From dc5e6fb1e24aedba3b7739035e71782bdab9bba5 Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 10 Apr 2018 12:34:33 -0400 Subject: [PATCH] P4 to Git Change 1538923 by vsytchen@vsytchen-ocl-win10 on 2018/04/10 12:18:20 SWDEV-150166 - [OCL][LC][Ellesmere][Windows] oclcompiler.OCLStructOperations[30] causing crash 1. Add runtime flag GPU_MAX_COMMAND_BUFFERS to set the amount of command buffers allocated per queue for pal devices. ReviewBoardURL = http://ocltc.amd.com/reviews/r/14572/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#49 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#88 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#47 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#288 edit [ROCm/clr commit: 19332a7d5e2ac3f8b5f4180dac55987578921200] --- .../rocclr/runtime/device/pal/palsettings.cpp | 7 +++++ .../rocclr/runtime/device/pal/palsettings.hpp | 1 + .../rocclr/runtime/device/pal/palvirtual.cpp | 31 ++++++++++--------- .../rocclr/runtime/device/pal/palvirtual.hpp | 21 ++++++------- projects/clr/rocclr/runtime/utils/flags.hpp | 2 ++ 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp index e33efd22e0..1bf6e31112 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp @@ -141,6 +141,9 @@ Settings::Settings() { subAllocationChunkSize_ = 64 * Mi; subAllocationMaxSize_ = std::min(static_cast(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_); + + // Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments + maxCmdBuffers_ = 8; } bool Settings::create(const Pal::DeviceProperties& palProp, @@ -495,6 +498,10 @@ void 
Settings::override() { break; } } + + if (!flagIsDefault(GPU_MAX_COMMAND_BUFFERS)) { + maxCmdBuffers_ = GPU_MAX_COMMAND_BUFFERS; + } } } // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp index 90dc533bf6..bb9e2faf71 100644 --- a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp @@ -95,6 +95,7 @@ class Settings : public device::Settings { size_t numMemDependencies_; //!< The array size for memory dependencies tracking uint64_t maxAllocSize_; //!< Maximum single allocation size uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT + uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index e24ddc5373..1463b81cad 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -36,7 +36,7 @@ namespace pal { VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType, uint engineIdx, Pal::ICmdAllocator* cmdAllocator, uint rtCU, amd::CommandQueue::Priority priority, - uint64_t residency_limit) { + uint64_t residency_limit, uint max_command_buffers) { Pal::Result result; Pal::CmdBufferCreateInfo cmdCreateInfo = {}; Pal::QueueCreateInfo qCreateInfo = {}; @@ -81,8 +81,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp return nullptr; } - size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize); - VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit); + size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize); + 
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers); if (queue != nullptr) { address addrQ = reinterpret_cast<address>
(&queue[1]); // Create PAL queue object @@ -93,10 +93,10 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp } queue->UpdateAppPowerProfile(); address addrCmd = addrQ + qSize; - address addrF = addrCmd + MaxCmdBuffers * cmdSize; + address addrF = addrCmd + max_command_buffers * cmdSize; Pal::CmdBufferBuildInfo cmdBuildInfo = {}; - for (uint i = 0; i < MaxCmdBuffers; ++i) { + for (uint i = 0; i < max_command_buffers; ++i) { result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]); if (result != Pal::Result::Success) { delete queue; @@ -133,7 +133,7 @@ VirtualGPU::Queue::~Queue() { } memReferences_.clear(); - for (uint i = 0; i < MaxCmdBuffers; ++i) { + for (uint i = 0; i < max_command_buffers_; ++i) { if (nullptr != iCmdBuffs_[i]) { iCmdBuffs_[i]->Destroy(); } @@ -274,14 +274,14 @@ bool VirtualGPU::Queue::flush() { } // Wrap current slot - cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers; + cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_; waifForFence(cmdBufIdSlot_); // Progress retired TS - if ((cmdBufIdCurrent_ > MaxCmdBuffers) && - (cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) { - cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers; + if ((cmdBufIdCurrent_ > max_command_buffers_) && + (cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) { + cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_; } // Reset command buffer, so CB chunks could be reused @@ -326,7 +326,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) { return true; } - uint slotId = id % MaxCmdBuffers; + uint slotId = id % max_command_buffers_; constexpr bool IbReuse = true; bool result = waifForFence(slotId); cmbBufIdRetired_ = id; @@ -343,7 +343,7 @@ bool VirtualGPU::Queue::isDone(uint id) { flush(); } - if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) { + if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) { return false; } cmbBufIdRetired_ 
= id; @@ -784,6 +784,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, uint idx = index() % (dev().numComputeEngines() - firstQueue); uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2); + uint max_cmd_buffers = dev().settings().maxCmdBuffers_; if (dev().numComputeEngines()) { //! @todo There is a hang with a mix of user and non user queues. @@ -795,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs, priority, - residency_limit); + residency_limit, max_cmd_buffers); if (nullptr == queues_[MainEngine]) { return false; } @@ -813,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal, - residency_limit); + residency_limit, max_cmd_buffers); if (nullptr == queues_[SdmaEngine]) { return false; } } else { queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal, - residency_limit); + residency_limit, max_cmd_buffers); if (nullptr == queues_[SdmaEngine]) { return false; } diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index 801a8750d6..04b4facc1f 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -41,8 +41,6 @@ class VirtualGPU : public device::VirtualDevice { public: class Queue : public amd::HeapObject { public: - // Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments - static const uint MaxCmdBuffers = 8; static const 
uint MaxCommands = 256; static const uint StartCmdBufIdx = 1; static const uint FirstMemoryReference = 0x80000000; @@ -58,11 +56,14 @@ class VirtualGPU : public device::VirtualDevice { Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator uint rtCU, //!< The number of reserved CUs amd::CommandQueue::Priority priority, //!< Queue priority - uint64_t residency_limit //!< Enables residency limit + uint64_t residency_limit, //!< Enables residency limit + uint max_command_buffers //!< Number of allocated command buffers ); - Queue(Pal::IDevice* palDev, uint64_t residency_limit) + Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers) : iQueue_(nullptr), + iCmdBuffs_(max_command_buffers, nullptr), + iCmdFences_(max_command_buffers, nullptr), last_kernel_(nullptr), iDev_(palDev), cmdBufIdSlot_(StartCmdBufIdx), @@ -71,12 +72,9 @@ class VirtualGPU : public device::VirtualDevice { cmdCnt_(0), vlAlloc_(64 * Ki), residency_size_(0), - residency_limit_(residency_limit) + residency_limit_(residency_limit), + max_command_buffers_(max_command_buffers) { - for (uint i = 0; i < MaxCmdBuffers; ++i) { - iCmdBuffs_[i] = nullptr; - iCmdFences_[i] = nullptr; - } vlAlloc_.Init(); } @@ -152,8 +150,8 @@ class VirtualGPU : public device::VirtualDevice { uint cmdBufId() const { return cmdBufIdCurrent_; } Pal::IQueue* iQueue_; //!< PAL queue object - Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers - Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD + std::vector iCmdBuffs_; //!< PAL command buffers + std::vector iCmdFences_; //!< PAL fences, associated with CMD const amd::Kernel* last_kernel_; //!< Last submitted kernel private: @@ -172,6 +170,7 @@ class VirtualGPU : public device::VirtualDevice { std::vector palSdiRefs_; uint64_t residency_size_; //!< Resource residency size uint64_t residency_limit_; //!< Enables residency limit + uint max_command_buffers_; }; struct CommandBatch : public amd::HeapObject { diff 
--git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index eb29a04ed4..ea6a98f1e5 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -217,6 +217,8 @@ release_on_stg(uint, PAL_RGP_DISP_COUNT, 10, \ "The number of dispatches for RGP capture with SQTT") \ release(bool, GPU_FORCE_WAVE_SIZE_32, false, \ "Forces WaveSize32 compilation in SC") \ +release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \ + "The maximum number of command buffers allocated per queue") \ namespace amd {