Sequence queue error callbacks with queue destroy.

HSA v1.2 update. Change-Id: I13975e71b2c1ea5b7738236f5d02df84312ad00c
2018-02-28 05:56:39 -06:00
parent c93584e725
commit 6df9ba97ce
4 changed files with 90 additions and 90 deletions
@@ -204,12 +204,13 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  void CloseRingBufferFD(const char* ring_buf_shm_path, int fd) const;
  int CreateRingBufferFD(const char* ring_buf_shm_path, uint32_t ring_buf_phys_size_bytes) const;

-  static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg);
-
  /// @brief Define the Scratch Buffer Descriptor and related parameters
  /// that enable kernel access scratch memory
  void InitScratchSRD();

+  /// @brief Handler for hardware queue events.
+  static bool DynamicScratchHandler(hsa_signal_value_t error_code, void* arg);
+
  // AQL packet ring buffer
  void* ring_buf_;

@@ -246,6 +247,10 @@ class AqlQueue : public core::Queue, private core::LocalSignal, public core::Doo
  uint32_t pm4_ib_size_b_;
  KernelMutex pm4_ib_mutex_;

+  // Error handler control state: a mask of the ERROR_HANDLER_* flags declared below.
+  std::atomic<uint32_t> dynamicScratchState;
+  enum { ERROR_HANDLER_DONE = 1, ERROR_HANDLER_TERMINATE = 2 };
+
  // Shared event used for queue errors
  static HsaEvent* queue_event_;

@@ -77,7 +77,7 @@ class GpuAgentInt : public core::Agent {
      : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}

  // @brief Ensure blits are ready (performance hint).
-  virtual void PreloadBlits(){};
+  virtual void PreloadBlits() {}

  // @brief Initialization hook invoked after tools library has loaded,
  // to allow tools interception of interface functions.
@@ -72,7 +72,7 @@ namespace amd {
 // Queue::amd_queue_ is cache-aligned for performance.
 const uint32_t kAmdQueueAlignBytes = 0x40;

-HsaEvent* AqlQueue::queue_event_ = NULL;
+HsaEvent* AqlQueue::queue_event_ = nullptr;
 std::atomic<uint32_t> AqlQueue::queue_count_(0);
 KernelMutex AqlQueue::queue_lock_;
 int AqlQueue::rtti_id_ = 0;
@@ -92,7 +92,8 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
      errors_data_(err_data),
      is_kv_queue_(is_kv),
      pm4_ib_buf_(nullptr),
-      pm4_ib_size_b_(0x1000) {
+      pm4_ib_size_b_(0x1000),
+      dynamicScratchState(0) {
  // When queue_full_workaround_ is set to 1, the ring buffer is internally
  // doubled in size. Virtual addresses in the upper half of the ring allocation
  // are mapped to the same set of pages backing the lower half.
@@ -279,8 +280,18 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
 }

 AqlQueue::~AqlQueue() {
-  Inactivate();
+  // Remove the error handler synchronously: request termination, then wait until the handler
+  // reports ERROR_HANDLER_DONE, so error handler callbacks are sequenced before queue destroy.
+  dynamicScratchState |= ERROR_HANDLER_TERMINATE;
+  HSA::hsa_signal_store_screlease(amd_queue_.queue_inactive_signal, 0x8000000000000000ull);
+  while ((dynamicScratchState & ERROR_HANDLER_DONE) != ERROR_HANDLER_DONE) {
+    HSA::hsa_signal_wait_relaxed(amd_queue_.queue_inactive_signal, HSA_SIGNAL_CONDITION_NE,
+                                 0x8000000000000000ull, -1ull, HSA_WAIT_STATE_BLOCKED);
+    HSA::hsa_signal_store_relaxed(amd_queue_.queue_inactive_signal, 0x8000000000000000ull);
+  }

+  auto err = hsaKmtDestroyQueue(queue_id_);
+  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtDestroyQueue failed.");
  FreeRegisteredRingBuffer();
  agent_->ReleaseQueueScratch(queue_scratch_.queue_base);
  HSA::hsa_signal_destroy(amd_queue_.queue_inactive_signal);
@@ -289,7 +300,7 @@ AqlQueue::~AqlQueue() {
    queue_count_--;
    if (queue_count_ == 0) {
      core::InterruptSignal::DestroyEvent(queue_event_);
-      queue_event_ = NULL;
+      queue_event_ = nullptr;
    }
  }
  core::Runtime::runtime_singleton_->system_deallocator()(pm4_ib_buf_);
@@ -678,107 +689,88 @@ int AqlQueue::CreateRingBufferFD(const char* ring_buf_shm_path,
 hsa_status_t AqlQueue::Inactivate() {
  bool active = active_.exchange(false, std::memory_order_relaxed);
  if (active) {
-    auto err = hsaKmtDestroyQueue(this->queue_id_);
-    assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtDestroyQueue failed.");
+    auto err = hsaKmtUpdateQueue(queue_id_, 0, HSA_QUEUE_PRIORITY_NORMAL, NULL, 0, NULL);
+    assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtUpdateQueue failed.");
  }
  return HSA_STATUS_SUCCESS;
 }

 bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
  AqlQueue* queue = (AqlQueue*)arg;
+  hsa_status_t errorCode = HSA_STATUS_SUCCESS;

-  if ((error_code & 1) == 1) {
-    // Insufficient scratch - recoverable
-    auto& scratch = queue->queue_scratch_;
+  // Process errors only if queue is not terminating.
+  if ((queue->dynamicScratchState & ERROR_HANDLER_TERMINATE) != ERROR_HANDLER_TERMINATE) {
+    // Process only one queue error, don't fall through.
+    if (error_code == 1) {
+      // Insufficient scratch - recoverable. Dynamic scratch is processed only when no other
+      // error bits are set (error_code is exactly 1).
+      auto& scratch = queue->queue_scratch_;

-    queue->agent_->ReleaseQueueScratch(scratch.queue_base);
+      queue->agent_->ReleaseQueueScratch(scratch.queue_base);

-    uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size;
+      uint64_t pkt_slot_idx = queue->amd_queue_.read_dispatch_id % queue->amd_queue_.hsa_queue.size;

-    const core::AqlPacket& pkt =
-        ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx];
+      const core::AqlPacket& pkt =
+          ((core::AqlPacket*)queue->amd_queue_.hsa_queue.base_address)[pkt_slot_idx];

-    uint32_t scratch_request = pkt.dispatch.private_segment_size;
+      uint32_t scratch_request = pkt.dispatch.private_segment_size;

-    scratch.size_per_thread =
-        Max(uint32_t(scratch.size_per_thread * 2), scratch_request);
-    // Align whole waves to 1KB.
-    scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
-    scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
-        queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;
+      scratch.size_per_thread = Max(uint32_t(scratch.size_per_thread * 2), scratch_request);
+      // Align whole waves to 1KB.
+      scratch.size_per_thread = AlignUp(scratch.size_per_thread, 16);
+      scratch.size = scratch.size_per_thread * (queue->amd_queue_.max_cu_id + 1) *
+          queue->agent_->properties().MaxSlotsScratchCU * queue->agent_->properties().WaveFrontSize;

-    queue->agent_->AcquireQueueScratch(scratch);
-    if (scratch.queue_base == NULL) {
-      // Out of scratch - promote error and invalidate queue
-      queue->Inactivate();
-      if (queue->errors_callback_ != NULL)
-        queue->errors_callback_(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
-                                queue->public_handle(), queue->errors_data_);
-      return false;
+      queue->agent_->AcquireQueueScratch(scratch);
+
+      // Out of scratch - promote error
+      if (scratch.queue_base == NULL) errorCode = HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+      // Reset scratch memory related entities for the queue
+      queue->InitScratchSRD();
+
+    } else if ((error_code & 2) == 2) {  // Invalid dim
+      errorCode = HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+
+    } else if ((error_code & 4) == 4) {  // Invalid group memory
+      errorCode = HSA_STATUS_ERROR_INVALID_ALLOCATION;
+
+    } else if ((error_code & 8) == 8) {  // Invalid (or NULL) code
+      errorCode = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
+
+    } else if (((error_code & 32) == 32) ||    // Invalid format: 32 is generic,
+               ((error_code & 256) == 256)) {  // 256 is vendor specific packets
+      errorCode = HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
+
+    } else if ((error_code & 64) == 64) {  // Group is too large
+      errorCode = HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+    } else if ((error_code & 128) == 128) {  // Out of VGPRs
+      errorCode = HSA_STATUS_ERROR_INVALID_ISA;
+
+    } else if ((error_code & 0x80000000) == 0x80000000) {  // Debug trap
+      errorCode = HSA_STATUS_ERROR_EXCEPTION;
+
+    } else {  // Undefined code
+      assert(false && "Undefined queue error code");
+      errorCode = HSA_STATUS_ERROR;
    }

-    // Reset scratch memory related entities for the queue
-    queue->InitScratchSRD();
+    if (errorCode == HSA_STATUS_SUCCESS) {
+      HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0);
+      return true;
+    }

-  } else if ((error_code & 2) == 2) {  // Invalid dim
    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if ((error_code & 4) == 4) {  // Invalid group memory
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ALLOCATION,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if ((error_code & 8) == 8) {  // Invalid (or NULL) code
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_CODE_OBJECT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-
-  } else if (((error_code & 32) == 32) ||
-             ((error_code & 256) == 256)) {  // Invalid format: 32 is generic,
-                                             // 256 is vendor specific packets
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_PACKET_FORMAT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 64) == 64) {  // Group is too large
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ARGUMENT,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 128) == 128) {  // Out of VGPRs
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_INVALID_ISA,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else if ((error_code & 0x80000000) == 0x80000000) {  // Debug trap
-    queue->Inactivate();
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR_EXCEPTION,
-                              queue->public_handle(), queue->errors_data_);
-    return false;
-  } else {
-    // Undefined code
-    queue->Inactivate();
-    assert(false && "Undefined queue error code");
-    if (queue->errors_callback_ != NULL)
-      queue->errors_callback_(HSA_STATUS_ERROR, queue->public_handle(),
-                              queue->errors_data_);
-    return false;
+    if (queue->errors_callback_ != nullptr)
+      queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_);
  }
-
-  HSA::hsa_signal_store_relaxed(queue->amd_queue_.queue_inactive_signal, 0);
-  return true;
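+  // Copy the signal handle before publishing ERROR_HANDLER_DONE: once the state is set the
+  // destructor may stop waiting and release the queue, so the queue must not be dereferenced
+  // between setting the scratch state and updating the signal value.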
+  hsa_signal_t signal = queue->amd_queue_.queue_inactive_signal;
+  queue->dynamicScratchState = ERROR_HANDLER_DONE;
+  HSA::hsa_signal_store_screlease(signal, -1ull);
+  return false;
 }

 hsa_status_t AqlQueue::SetCUMasking(const uint32_t num_cu_mask_count,
@@ -1222,6 +1222,9 @@ void Runtime::Unload() {
  amd::hsa::loader::Loader::Destroy(loader_);
  loader_ = nullptr;

+  std::for_each(gpu_agents_.begin(), gpu_agents_.end(), DeleteObject());
+  gpu_agents_.clear();
+
  async_events_control_.Shutdown();

  if (vm_fault_signal_ != nullptr) {