P4 to Git Change 1538923 by vsytchen@vsytchen-ocl-win10 on 2018/04/10 12:18:20

SWDEV-150166 - [OCL][LC][Ellesmere][Windows] oclcompiler.OCLStructOperations[30] causing crash 1. Add runtime flag GPU_MAX_COMMAND_BUFFERS to set the amount of command buffers allocated per queue for pal devices. ReviewBoardURL = http://ocltc.amd.com/reviews/r/14572/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#49 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#88 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#47 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#288 edit [ROCm/clr commit: 19332a7d5e]
2018-04-10 12:34:33 -04:00
parent a9c0878566
commit dc5e6fb1e2
@@ -141,6 +141,9 @@ Settings::Settings() {
  subAllocationChunkSize_ = 64 * Mi;
  subAllocationMaxSize_ =
    std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
+
+  // Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments
+  maxCmdBuffers_ = 8;
 }

 bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -495,6 +498,10 @@ void Settings::override() {
        break;
    }
  }
+
+  if (!flagIsDefault(GPU_MAX_COMMAND_BUFFERS) && (GPU_MAX_COMMAND_BUFFERS != 0)) {
+    maxCmdBuffers_ = GPU_MAX_COMMAND_BUFFERS;  // 0 is rejected: queue slots are (id % count)
+  }
 }

 }  // namespace pal
@@ -95,6 +95,7 @@ class Settings : public device::Settings {
  size_t numMemDependencies_;  //!< The array size for memory dependencies tracking
  uint64_t maxAllocSize_;      //!< Maximum single allocation size
  uint rgpSqttDispCount_;      //!< The number of dispatches captured in SQTT
+  uint maxCmdBuffers_;         //!< Maximum number of command buffers allocated per queue

  uint64_t subAllocationMinSize_;   //!< Minimum size allowed for suballocations
  uint64_t subAllocationMaxSize_;   //!< Maximum size allowed with suballocations
@@ -36,7 +36,7 @@ namespace pal {
 VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
-                                             uint64_t residency_limit) {
+                                             uint64_t residency_limit, uint max_command_buffers) {
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
@@ -81,8 +81,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
    return nullptr;
  }

-  size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize);
-  VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit);
+  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
+  VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = reinterpret_cast<address>(&queue[1]);
    // Create PAL queue object
@@ -93,10 +93,10 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueTyp
    }
    queue->UpdateAppPowerProfile();
    address addrCmd = addrQ + qSize;
-    address addrF = addrCmd + MaxCmdBuffers * cmdSize;
+    address addrF = addrCmd + max_command_buffers * cmdSize;
    Pal::CmdBufferBuildInfo cmdBuildInfo = {};

-    for (uint i = 0; i < MaxCmdBuffers; ++i) {
+    for (uint i = 0; i < max_command_buffers; ++i) {
      result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
@@ -133,7 +133,7 @@ VirtualGPU::Queue::~Queue() {
  }
  memReferences_.clear();

-  for (uint i = 0; i < MaxCmdBuffers; ++i) {
+  for (uint i = 0; i < max_command_buffers_; ++i) {
    if (nullptr != iCmdBuffs_[i]) {
      iCmdBuffs_[i]->Destroy();
    }
@@ -274,14 +274,14 @@ bool VirtualGPU::Queue::flush() {
  }

  // Wrap current slot
-  cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
+  cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;

  waifForFence<IbReuse>(cmdBufIdSlot_);

  // Progress retired TS
-  if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
-      (cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) {
-    cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers;
+  if ((cmdBufIdCurrent_ > max_command_buffers_) &&
+      (cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
+    cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
  }

  // Reset command buffer, so CB chunks could be reused
@@ -326,7 +326,7 @@ bool VirtualGPU::Queue::waitForEvent(uint id) {
    return true;
  }

-  uint slotId = id % MaxCmdBuffers;
+  uint slotId = id % max_command_buffers_;
  constexpr bool IbReuse = true;
  bool result = waifForFence<!IbReuse>(slotId);
  cmbBufIdRetired_ = id;
@@ -343,7 +343,7 @@ bool VirtualGPU::Queue::isDone(uint id) {
    flush();
  }

-  if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) {
+  if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) {
    return false;
  }
  cmbBufIdRetired_ = id;
@@ -784,6 +784,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
  uint idx = index() % (dev().numComputeEngines() - firstQueue);
  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
    (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
+  uint max_cmd_buffers = dev().settings().maxCmdBuffers_;

  if (dev().numComputeEngines()) {
    //! @todo There is a hang with a mix of user and non user queues.
@@ -795,7 +796,7 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,

    queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
                                        cmdAllocator_, rtCUs, priority,
-                                        residency_limit);
+                                        residency_limit, max_cmd_buffers);
    if (nullptr == queues_[MainEngine]) {
      return false;
    }
@@ -813,14 +814,14 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
      queues_[SdmaEngine] =
          Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
                        amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
-                        residency_limit);
+                        residency_limit, max_cmd_buffers);
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
    } else {
        queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
            idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
-            residency_limit);
+            residency_limit, max_cmd_buffers);
        if (nullptr == queues_[SdmaEngine]) {
            return false;
        }
@@ -41,8 +41,6 @@ class VirtualGPU : public device::VirtualDevice {
 public:
  class Queue : public amd::HeapObject {
   public:
-    // Note: More command buffers may cause a HW hang with HWSC on VI family in OCLPerfKernelArguments
-    static const uint MaxCmdBuffers = 8;
    static const uint MaxCommands = 256;
    static const uint StartCmdBufIdx = 1;
    static const uint FirstMemoryReference = 0x80000000;
@@ -58,11 +56,14 @@ class VirtualGPU : public device::VirtualDevice {
                         Pal::ICmdAllocator* cmdAlloc,         //!< PAL CMD buffer allocator
                         uint rtCU,                            //!< The number of reserved CUs
                         amd::CommandQueue::Priority priority, //!< Queue priority
-                         uint64_t residency_limit              //!< Enables residency limit
+                         uint64_t residency_limit,             //!< Enables residency limit
+                         uint max_command_buffers              //!< Number of allocated command buffers
                         );

-    Queue(Pal::IDevice* palDev, uint64_t residency_limit)
+    Queue(Pal::IDevice* palDev, uint64_t residency_limit, uint max_command_buffers)
        : iQueue_(nullptr),
+          iCmdBuffs_(max_command_buffers, nullptr),
+          iCmdFences_(max_command_buffers, nullptr),
          last_kernel_(nullptr),
          iDev_(palDev),
          cmdBufIdSlot_(StartCmdBufIdx),
@@ -71,12 +72,9 @@ class VirtualGPU : public device::VirtualDevice {
          cmdCnt_(0),
          vlAlloc_(64 * Ki),
          residency_size_(0),
-          residency_limit_(residency_limit)
+          residency_limit_(residency_limit),
+          max_command_buffers_(max_command_buffers)
    {
-      for (uint i = 0; i < MaxCmdBuffers; ++i) {
-        iCmdBuffs_[i] = nullptr;
-        iCmdFences_[i] = nullptr;
-      }
      vlAlloc_.Init();
    }

@@ -152,8 +150,8 @@ class VirtualGPU : public device::VirtualDevice {
    uint cmdBufId() const { return cmdBufIdCurrent_; }

    Pal::IQueue* iQueue_;                        //!< PAL queue object
-    Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers];  //!< PAL command buffers
-    Pal::IFence* iCmdFences_[MaxCmdBuffers];     //!< PAL fences, associated with CMD
+    std::vector<Pal::ICmdBuffer*> iCmdBuffs_;    //!< PAL command buffers
+    std::vector<Pal::IFence*> iCmdFences_;       //!< PAL fences, associated with CMD
    const amd::Kernel* last_kernel_;             //!< Last submitted kernel

  private:
@@ -172,6 +170,7 @@ class VirtualGPU : public device::VirtualDevice {
    std::vector<const Pal::IGpuMemory*>   palSdiRefs_;
    uint64_t  residency_size_;  //!< Resource residency size
    uint64_t  residency_limit_; //!< Enables residency limit
+    uint max_command_buffers_;  //!< Number of PAL command buffers/fences allocated for the queue
  };

  struct CommandBatch : public amd::HeapObject {
@@ -217,6 +217,8 @@ release_on_stg(uint, PAL_RGP_DISP_COUNT, 10,                                  \
        "The number of dispatches for RGP capture with SQTT")                 \
 release(bool, GPU_FORCE_WAVE_SIZE_32, false,                                  \
        "Forces WaveSize32 compilation in SC")                                \
+release(uint, GPU_MAX_COMMAND_BUFFERS, 8,                                     \
+         "The maximum number of command buffers allocated per queue")         \


 namespace amd {