Blit SDMA support for gfx70x

Change-Id: Ie6f215890553ef41c3f36b349fc9cc39c2d38747 [ROCm/ROCR-Runtime commit: 103cd04236]
2016-05-31 13:38:45 -05:00
parent f937d533f6
commit 7b50eacba5
5 changed files with 54 additions and 14 deletions
@@ -227,6 +227,9 @@ class BlitSdma : public core::Blit {

  /// Max total fill count supported by the queue.
  size_t max_total_fill_size_;
+
+  /// True if the agent supports platform atomics.
+  bool platform_atomic_support_;
 };
 }  // namespace amd

@@ -79,6 +79,11 @@ class GpuAgentInt : public core::Agent {
  // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
  virtual void InitDma() = 0;

+  // @brief Initialize blit kernel object based on AQL queue.
+  //
+  // @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
+  virtual hsa_status_t InitBlitKernel() = 0;
+
  // @brief Invoke the user provided callback for each region accessible by
  // this agent.
  //
@@ -178,10 +183,8 @@ class GpuAgent : public GpuAgentInt {
  // @brief Override from core::Agent.
  void InitDma() override;

-  // @brief Initialize blit kernel object based on AQL queue.
-  //
-  // @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful.
-  hsa_status_t InitBlitKernel();
+  // @brief Override from GpuAgentInt.
+  hsa_status_t InitBlitKernel() override;

  uint16_t GetMicrocodeVersion() const;

@@ -365,7 +365,8 @@ BlitSdma::BlitSdma()
      fence_pool_size_(0),
      fence_pool_counter_(0),
      cached_reserve_offset_(0),
-      cached_commit_offset_(0) {
+      cached_commit_offset_(0),
+      platform_atomic_support_(true) {
  std::memset(&queue_resource_, 0, sizeof(queue_resource_));
 }

@@ -418,6 +419,10 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
    return HSA_STATUS_ERROR;
  }

+  if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
+    platform_atomic_support_ = false;
+  }
+
  // Allocate queue buffer.
  queue_size_ = kQueueSize;

@@ -568,7 +573,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
  const uint32_t total_copy_command_size =
      num_copy_command * linear_copy_command_size_;

-  // In case the user disable or enable the profiling in the middle of the call.
+  // Load the profiling state early in case the user disables or enables
+  // profiling in the middle of the call.
  const bool profiling_enabled = agent_->profiling_enabled();

  uint64_t* end_ts_addr = NULL;
@@ -589,8 +595,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
        (2 * timestamp_command_size_) + linear_copy_command_size_;
  }

+  // On agents that do not support platform atomics, the atomic decrement is
+  // replaced with one or two fence packets that write the new signal value.
+  // Fence packets are used instead of write packets because the SDMA engine
+  // may overlap serial copy/write packets.
+  const uint64_t completion_signal_value =
+      static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
+  const size_t sync_command_size = (platform_atomic_support_)
+                                       ? atomic_command_size_
+                                       : (completion_signal_value > UINT32_MAX)
+                                             ? 2 * fence_command_size_
+                                             : fence_command_size_;
+
  const uint32_t total_command_size =
-      total_poll_command_size + total_copy_command_size + atomic_command_size_ +
+      total_poll_command_size + total_copy_command_size + sync_command_size +
      total_timestamp_command_size;

  char* command_addr = AcquireWriteAddress(total_command_size);
@@ -635,7 +653,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
  }

  // After transfer is completed, decrement the signal.
-  BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
+  if (platform_atomic_support_) {
+    BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
+  } else {
+    uint32_t* signal_value_location =
+        reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
+    if (completion_signal_value > UINT32_MAX) {
+      BuildFenceCommand(command_addr, signal_value_location + 1,
+                        static_cast<uint32_t>(completion_signal_value >> 32));
+      command_addr += fence_command_size_;
+    }
+
+    BuildFenceCommand(command_addr, signal_value_location,
+                      static_cast<uint32_t>(completion_signal_value));
+  }

  ReleaseWriteAddress(command_addr_temp, total_command_size);

@@ -78,12 +78,6 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) {
  GpuAgent* gpu = new GpuAgent(node_id, node_prop);
  core::Runtime::runtime_singleton_->RegisterAgent(gpu);

-  if (HSA_STATUS_SUCCESS != gpu->InitBlitKernel()) {
-    assert(false && "Fail init blit");
-    delete gpu;
-    gpu = NULL;
-  }
-
  return gpu;
 }

@@ -430,6 +430,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
      [](void* dst, const void* src, size_t size,
         std::vector<core::Signal*> dep_signals,
         core::Signal* completion_signal, bool profiling_enabled) {
+
        for (core::Signal* dep : dep_signals) {
          dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                           HSA_WAIT_STATE_BLOCKED);
@@ -820,6 +821,14 @@ void Runtime::Load() {

  // Load tools libraries
  LoadTools();
+
+  // Initialize the blit kernel objects after the tools libraries are loaded so
+  // that tools can overload the blit kernel.
+  for (core::Agent* agent : gpu_agents_) {
+    const hsa_status_t stat =
+        reinterpret_cast<amd::GpuAgentInt*>(agent)->InitBlitKernel();
+    assert(HSA_STATUS_SUCCESS == stat);
+  }
 }

 void Runtime::Unload() {