P4 to Git Change 1538923 by vsytchen@vsytchen-ocl-win10 on 2018/04/10 12:18:20
SWDEV-150166 - [OCL][LC][Ellesmere][Windows] oclcompiler.OCLStructOperations[30] causing crash
1. Add runtime flag GPU_MAX_COMMAND_BUFFERS to set the number of command buffers allocated per queue for PAL devices.
ReviewBoardURL = http://ocltc.amd.com/reviews/r/14572/diff/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#49 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#18 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#88 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#47 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#288 edit
[ROCm/clr commit: 19332a7d5e]
This commit is contained in:
@@ -141,6 +141,9 @@ Settings::Settings() {
|
||||
subAllocationChunkSize_ = 64 * Mi;
|
||||
subAllocationMaxSize_ =
|
||||
std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
|
||||
|
||||
// Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments
|
||||
maxCmdBuffers_ = 8;
|
||||
}
|
||||
|
||||
bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
@@ -495,6 +498,10 @@ void Settings::override() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_MAX_COMMAND_BUFFERS)) {
|
||||
maxCmdBuffers_ = GPU_MAX_COMMAND_BUFFERS;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
|
||||
@@ -95,6 +95,7 @@ class Settings : public device::Settings {
|
||||
size_t numMemDependencies_; //!< The array size for memory dependencies tracking
|
||||
uint64_t maxAllocSize_; //!< Maximum single allocation size
|
||||
uint rgpSqttDispCount_; //!< The number of dispatches captured in SQTT
|
||||
uint maxCmdBuffers_; //!< Maximum number of command buffers allocated per queue
|
||||
|
||||
uint64_t subAllocationMinSize_; //!< Minimum size allowed for suballocations
|
||||
uint64_t subAllocationMaxSize_; //!< Maximum size allowed with suballocations
|
||||
|
||||
@@ -36,7 +36,7 @@ namespace pal {
|
||||
VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
|
||||
uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
|
||||
uint rtCU, amd::CommandQueue::Priority priority,
|
||||
uint64_t residency_limit) {
|
||||
uint64_t residency_limit, uint max_command_buffers) {
|
||||
Pal::Result result;
|
||||
Pal::CmdBufferCreateInfo cmdCreateInfo = {};
|
||||
Pal::QueueCreateInfo qCreateInfo = {};
|
||||
@@ -81,8 +81,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize);
|
||||
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit);
|
||||
size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
|
||||
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
|
||||
if (queue != nullptr) {
|
||||
address addrQ = reinterpret_cast<address>(&queue[1]);
|
||||
// Create PAL queue object
|
||||
@@ -93,10 +93,10 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
|
||||
}
|
||||
queue->UpdateAppPowerProfile();
|
||||
address addrCmd = addrQ + qSize;
|
||||
address addrF = addrCmd + MaxCmdBuffers * cmdSize;
|
||||
address addrF = addrCmd + max_command_buffers * cmdSize;
|
||||
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
|
||||
|
||||
for (uint i = 0; i < MaxCmdBuffers; ++i) {
|
||||
for (uint i = 0; i < max_command_buffers; ++i) {
|
||||
result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
|
||||
if (result != Pal::Result::Success) {
|
||||
delete queue;
|
||||
@@ -133,7 +133,7 @@ VirtualGPU::Queue::~Queue() {
|
||||
}
|
||||
memReferences_.clear();
|
||||
|
||||
for (uint i = 0; i < MaxCmdBuffers; ++i) {
|
||||
for (uint i = 0; i < max_command_buffers_; ++i) {
|
||||
if (nullptr != iCmdBuffs_[i]) {
|
||||
iCmdBuffs_[i]->Destroy();
|
||||
}
|
||||
@@ -274,14 +274,14 @@ bool VirtualGPU::Queue::flush() {
|
||||
}
|
||||
|
||||
// Wrap current slot
|
||||
cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
|
||||
cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;
|
||||
|
||||
waifForFence<IbReuse>(cmdBufIdSlot_);
|
||||
|
||||
// Progress retired TS
|
||||
if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
|
||||
(cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) {
|
||||
cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers;
|
||||
if ((cmdBufIdCurrent_ > max_command_buffers_) &&
|
||||
(cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
|
||||
cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
|
||||
}
|
||||
|
||||
// Reset command buffer, so CB chunks could be reused
|
||||
@@ -326,7 +326,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
|
||||
return true;
|
||||
}
|
||||
|
||||
uint slotId = id % MaxCmdBuffers;
|
||||
uint slotId = id % max_command_buffers_;
|
||||
constexpr bool IbReuse = true;
|
||||
bool result = waifForFence<!IbReuse>(slotId);
|
||||
cmbBufIdRetired_ = id;
|
||||
@@ -343,7 +343,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
|
||||
flush();
|
||||
}
|
||||
|
||||
if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) {
|
||||
if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) {
|
||||
return false;
|
||||
}
|
||||
cmbBufIdRetired_ = id;
|
||||
@@ -784,6 +784,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
uint idx = index() % (dev().numComputeEngines() - firstQueue);
|
||||
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
|
||||
(dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
|
||||
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
|
||||
|
||||
if (dev().numComputeEngines()) {
|
||||
//! @todo There is a hang with a mix of user and non user queues.
|
||||
@@ -795,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
|
||||
queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
|
||||
cmdAllocator_, rtCUs, priority,
|
||||
residency_limit);
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[MainEngine]) {
|
||||
return false;
|
||||
}
|
||||
@@ -813,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
queues_[SdmaEngine] =
|
||||
Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
|
||||
amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
|
||||
residency_limit);
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
|
||||
idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
|
||||
residency_limit);
|
||||
residency_limit, max_cmd_buffers);
|
||||
if (nullptr == queues_[SdmaEngine]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -41,8 +41,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
public:
|
||||
class Queue : public amd::HeapObject {
|
||||
public:
|
||||
// Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments
|
||||
static const uint MaxCmdBuffers = 8;
|
||||
static const uint MaxCommands = 256;
|
||||
static const uint StartCmdBufIdx = 1;
|
||||
static const uint FirstMemoryReference = 0x80000000;
|
||||
@@ -58,11 +56,14 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
|
||||
uint rtCU, //!< The number of reserved CUs
|
||||
amd::CommandQueue::Priority priority, //!< Queue priority
|
||||
uint64_t residency_limit //!< Enables residency limit
|
||||
uint64_t residency_limit, //!< Enables residency limit
|
||||
uint max_command_buffers //!< Number of allocated command buffers
|
||||
);
|
||||
|
||||
Queue(Pal::IDevice* palDev, uint64_t residency_limit)
|
||||
Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
|
||||
: iQueue_(nullptr),
|
||||
iCmdBuffs_(max_command_buffers, nullptr),
|
||||
iCmdFences_(max_command_buffers, nullptr),
|
||||
last_kernel_(nullptr),
|
||||
iDev_(palDev),
|
||||
cmdBufIdSlot_(StartCmdBufIdx),
|
||||
@@ -71,12 +72,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
cmdCnt_(0),
|
||||
vlAlloc_(64 * Ki),
|
||||
residency_size_(0),
|
||||
residency_limit_(residency_limit)
|
||||
residency_limit_(residency_limit),
|
||||
max_command_buffers_(max_command_buffers)
|
||||
{
|
||||
for (uint i = 0; i < MaxCmdBuffers; ++i) {
|
||||
iCmdBuffs_[i] = nullptr;
|
||||
iCmdFences_[i] = nullptr;
|
||||
}
|
||||
vlAlloc_.Init();
|
||||
}
|
||||
|
||||
@@ -152,8 +150,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
uint cmdBufId() const { return cmdBufIdCurrent_; }
|
||||
|
||||
Pal::IQueue* iQueue_; //!< PAL queue object
|
||||
Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers
|
||||
Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD
|
||||
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
|
||||
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
|
||||
const amd::Kernel* last_kernel_; //!< Last submitted kernel
|
||||
|
||||
private:
|
||||
@@ -172,6 +170,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
|
||||
uint64_t residency_size_; //!< Resource residency size
|
||||
uint64_t residency_limit_; //!< Enables residency limit
|
||||
uint max_command_buffers_;
|
||||
};
|
||||
|
||||
struct CommandBatch : public amd::HeapObject {
|
||||
|
||||
@@ -217,6 +217,8 @@ release_on_stg(uint, PAL_RGP_DISP_COUNT, 10, \
|
||||
"The number of dispatches for RGP capture with SQTT") \
|
||||
release(bool, GPU_FORCE_WAVE_SIZE_32, false, \
|
||||
"Forces WaveSize32 compilation in SC") \
|
||||
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
|
||||
"The maximum number of command buffers allocated per queue") \
|
||||
|
||||
|
||||
namespace amd {
|
||||
|
||||
Reference in a new issue
Block a user