SWDEV-539130 - Log blit copy duration (#258)

Co-authored-by: Pengda Xie <pengda.xie@amd.com>
2025-09-03 10:01:47 -07:00
Commit 83a10986a4
@@ -71,10 +71,8 @@ class BlitKernel : public core::Blit {
  ///
  /// @note: The call will block until all AQL packets have been executed.
  ///
-  /// @param agent Agent passed to Initialize.
-  ///
  /// @return hsa_status_t
-  virtual hsa_status_t Destroy(const core::Agent& agent) override;
+  virtual hsa_status_t Destroy() override;

  /// @brief Submit an AQL packet to perform vector copy. The call is blocking
  /// until the command execution is finished.
@@ -189,6 +187,9 @@ class BlitKernel : public core::Blit {

  std::map<KernelType, KernelCode> kernels_;

+  /// Agent passed to Initialize (stored by address, not owned); null until Initialize is called.
+  const core::Agent* agent_ = nullptr;
+
  /// AQL queue for submitting the vector copy kernel.
  core::Queue* queue_;
  uint32_t queue_bitmask_;
@@ -94,10 +94,8 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
  ///
  /// @note: The call will block until all packets have executed.
  ///
-  /// @param agent Agent passed to Initialize.
-  ///
  /// @return hsa_status_t
-  virtual hsa_status_t Destroy(const core::Agent& agent) override;
+  virtual hsa_status_t Destroy() override;

  /// @brief Submit a linear copy command to the queue buffer.
  ///
@@ -63,7 +63,5 @@ class Blit {
-  /// @param agent Agent passed to Initialize.
-  ///
  /// @return hsa_status_t
-  virtual hsa_status_t Destroy(const core::Agent& agent) = 0;
+  virtual hsa_status_t Destroy() = 0;

  /// @brief Submit a linear copy command to the the underlying compute device's
  /// control block. The call is blocking until the command execution is
@@ -551,6 +551,7 @@ BlitKernel::BlitKernel(core::Queue* queue)
 BlitKernel::~BlitKernel() {}

 hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
+  agent_ = &agent;
  queue_bitmask_ = queue_->public_handle()->size - 1;

  bytes_written_.resize(queue_->public_handle()->size);
@@ -561,15 +562,15 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
    return status;
  }

-  const AMD::GpuAgent& gpuAgent = static_cast<const AMD::GpuAgent&>(agent);
+  const AMD::GpuAgent* gpuAgent = static_cast<const AMD::GpuAgent*>(agent_);
  kernarg_async_ = reinterpret_cast<KernelArgs*>(
-      gpuAgent.system_allocator()(queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16),
+      gpuAgent->system_allocator()(queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16),
                                  16, core::MemoryRegion::AllocateNoFlags));

  kernarg_async_mask_ = queue_->public_handle()->size - 1;

  // Obtain the number of compute units in the underlying agent.
-  num_cus_ = gpuAgent.properties().NumFComputeCores / 4;
+  num_cus_ = gpuAgent->properties().NumFComputeCores / 4;

  // Assemble shaders to AQL code objects.
  std::map<KernelType, const char*> kernel_names = {
@@ -579,29 +580,29 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {

  for (auto kernel_name : kernel_names) {
    KernelCode& kernel = kernels_[kernel_name.first];
-    gpuAgent.AssembleShader(kernel_name.second, AMD::GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
+    gpuAgent->AssembleShader(kernel_name.second, AMD::GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
                            kernel.code_buf_size_);
  }

-  if (agent.profiling_enabled()) {
+  if (agent_->profiling_enabled()) {
    return EnableProfiling(true);
  }

  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
+hsa_status_t BlitKernel::Destroy() {
  std::lock_guard<std::mutex> guard(lock_);

-  const AMD::GpuAgent& gpuAgent = static_cast<const AMD::GpuAgent&>(agent);
+  const AMD::GpuAgent* gpuAgent = static_cast<const AMD::GpuAgent*>(agent_);

  for (auto kernel_pair : kernels_) {
-    gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
+    gpuAgent->ReleaseShader(kernel_pair.second.code_buf_,
                           kernel_pair.second.code_buf_size_);
  }

  if (kernarg_async_ != NULL) {
-    gpuAgent.system_deallocator()(kernarg_async_);
+    gpuAgent->system_deallocator()(kernarg_async_);
  }

  if (completion_signal_.handle != 0) {
@@ -635,6 +636,13 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
    return HSA_STATUS_ERROR;
  }

+  // Log the copy duration from the completion signal's timestamps. Only attempted when the agent
+  // has profiling enabled (presumably the timestamps are only recorded in that mode - confirm).
+  if (agent_->profiling_enabled()) {
+    LogSignalDuration(HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS, completion_signal_,
+                      "BlitKernel::SubmitLinearCopyCommand");
+  }
+
  return HSA_STATUS_SUCCESS;
 }

@@ -171,7 +171,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
  if (queue_start_addr_ == NULL) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }
-  MAKE_NAMED_SCOPE_GUARD(cleanupOnException, [&]() { Destroy(agent); };);
+  MAKE_NAMED_SCOPE_GUARD(cleanupOnException, [&]() { Destroy(); };);
  std::memset(queue_start_addr_, 0, kQueueSize);

  bytes_written_.resize(kQueueSize);
@@ -208,7 +208,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
  return HSA_STATUS_SUCCESS;
 }

-template <bool useGCR> hsa_status_t BlitSdma<useGCR>::Destroy(const core::Agent& agent) {
+template <bool useGCR> hsa_status_t BlitSdma<useGCR>::Destroy() {
  // Release all allocated resources and reset them to zero.

  if (queue_resource_.QueueId != 0) {
@@ -738,7 +738,7 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
  rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? rec_eng : -1;

  if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) {
-    sdma->Destroy(*this);
+    sdma->Destroy();
    delete sdma;
    sdma = nullptr;
  }
@@ -750,7 +750,7 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
  AMD::BlitKernel* kernl = new AMD::BlitKernel(queue);

  if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
-    kernl->Destroy(*this);
+    kernl->Destroy();
    delete kernl;
    kernl = NULL;
  }
@@ -912,7 +912,7 @@ void GpuAgent::ReleaseResources() {
    this->Disable();
    for (auto& blit : blits_) {
      if (!blit.empty()) {
-        hsa_status_t status = blit->Destroy(*this);
+        hsa_status_t status = blit->Destroy();
        assert(status == HSA_STATUS_SUCCESS);
      }
    }
@@ -151,6 +151,28 @@ static __forceinline unsigned long long int strtoull(const char* str,
      rocr::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__);                             \
  } while (false);

+// Logs the start/end ticks stored in the amd_signal_t behind `signal`, plus their difference.
+// Nothing is printed unless `flag` is set in log_flags.
+//   flag   - log-flag bit tested against log_flags.
+//   signal - object with a `handle` member (e.g. hsa_signal_t). The handle is reinterpreted as
+//            an amd_signal_t*; presumably the caller guarantees that layout - confirm.
+//   msg    - C string printed as the prefix of the log line.
+// The signal's handle (not the signal object itself) is passed to the variadic call, and every
+// 64-bit value is cast to unsigned long long so it matches %llx/%llu on all data models.
+// `signal` is evaluated more than once; pass a side-effect-free expression.
+#define LogSignalDuration(flag, signal, msg)                                                       \
+  do {                                                                                             \
+    if (hsa_flag_isset64(log_flags, flag)) {                                                       \
+      const amd_signal_t* ts_signal_ = reinterpret_cast<const amd_signal_t*>((signal).handle);     \
+      rocr::log_printf(__FILENAME__, __LINE__,                                                     \
+        "%s Signal = (0x%llx), ticks start/end = %llu / %llu, Ticks elapsed = %llu", (msg),        \
+        static_cast<unsigned long long>((signal).handle),                                          \
+        static_cast<unsigned long long>(ts_signal_->start_ts),                                     \
+        static_cast<unsigned long long>(ts_signal_->end_ts),                                       \
+        static_cast<unsigned long long>(ts_signal_->end_ts - ts_signal_->start_ts));               \
+    }                                                                                              \
+  } while (false);
+
 // A macro to remove unused variable warnings
 #define UNUSED(x) (void)(x)