diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h index 077b9782b1..f37821c5fb 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_kernel.h @@ -71,10 +71,8 @@ class BlitKernel : public core::Blit { /// /// @note: The call will block until all AQL packets have been executed. /// - /// @param agent Agent passed to Initialize. - /// /// @return hsa_status_t - virtual hsa_status_t Destroy(const core::Agent& agent) override; + virtual hsa_status_t Destroy() override; /// @brief Submit an AQL packet to perform vector copy. The call is blocking /// until the command execution is finished. @@ -189,6 +187,8 @@ class BlitKernel : public core::Blit { std::map kernels_; + const core::Agent* agent_; + /// AQL queue for submitting the vector copy kernel. core::Queue* queue_; uint32_t queue_bitmask_; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 82d9dbeb16..d6fbc2085e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -94,10 +94,8 @@ template class BlitSdma : public BlitSdmaBase { /// /// @note: The call will block until all packets have executed. /// - /// @param agent Agent passed to Initialize. - /// /// @return hsa_status_t - virtual hsa_status_t Destroy(const core::Agent& agent) override; + virtual hsa_status_t Destroy() override; /// @brief Submit a linear copy command to the queue buffer. 
/// diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h index f2ba647910..ec78e8adac 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/blit.h @@ -63,7 +63,5 @@ class Blit { - /// @param agent Agent passed to Initialize. - /// /// @return hsa_status_t - virtual hsa_status_t Destroy(const core::Agent& agent) = 0; + virtual hsa_status_t Destroy() = 0; /// @brief Submit a linear copy command to the the underlying compute device's /// control block. The call is blocking until the command execution is diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp index 4efbe2dfeb..64f6599b8d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp @@ -551,6 +551,7 @@ BlitKernel::BlitKernel(core::Queue* queue) BlitKernel::~BlitKernel() {} hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { + agent_ = &agent; queue_bitmask_ = queue_->public_handle()->size - 1; bytes_written_.resize(queue_->public_handle()->size); @@ -561,15 +562,15 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { return status; } - const AMD::GpuAgent& gpuAgent = static_cast(agent); + const AMD::GpuAgent* gpuAgent = static_cast(agent_); kernarg_async_ = reinterpret_cast( - gpuAgent.system_allocator()(queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), + gpuAgent->system_allocator()(queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16, core::MemoryRegion::AllocateNoFlags)); kernarg_async_mask_ = queue_->public_handle()->size - 1; // Obtain the number of compute units in the underlying agent.
- num_cus_ = gpuAgent.properties().NumFComputeCores / 4; + num_cus_ = gpuAgent->properties().NumFComputeCores / 4; // Assemble shaders to AQL code objects. std::map kernel_names = { @@ -579,29 +580,29 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) { for (auto kernel_name : kernel_names) { KernelCode& kernel = kernels_[kernel_name.first]; - gpuAgent.AssembleShader(kernel_name.second, AMD::GpuAgent::AssembleTarget::AQL, kernel.code_buf_, + gpuAgent->AssembleShader(kernel_name.second, AMD::GpuAgent::AssembleTarget::AQL, kernel.code_buf_, kernel.code_buf_size_); } - if (agent.profiling_enabled()) { + if (agent_->profiling_enabled()) { return EnableProfiling(true); } return HSA_STATUS_SUCCESS; } -hsa_status_t BlitKernel::Destroy(const core::Agent& agent) { +hsa_status_t BlitKernel::Destroy() { std::lock_guard guard(lock_); - const AMD::GpuAgent& gpuAgent = static_cast(agent); + const AMD::GpuAgent* gpuAgent = static_cast(agent_); for (auto kernel_pair : kernels_) { - gpuAgent.ReleaseShader(kernel_pair.second.code_buf_, + gpuAgent->ReleaseShader(kernel_pair.second.code_buf_, kernel_pair.second.code_buf_size_); } if (kernarg_async_ != NULL) { - gpuAgent.system_deallocator()(kernarg_async_); + gpuAgent->system_deallocator()(kernarg_async_); } if (completion_signal_.handle != 0) { @@ -635,6 +636,11 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src, return HSA_STATUS_ERROR; } + if(agent_->profiling_enabled()) { + LogSignalDuration(HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS, completion_signal_, + "BlitKernel::SubmitLinearCopyCommand"); + } + return HSA_STATUS_SUCCESS; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index 429dbf0d1d..aed2ab9a71 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -171,7 +171,7 @@ 
hsa_status_t BlitSdma::Initialize(const core::Agent& agent, bool use_xgm if (queue_start_addr_ == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - MAKE_NAMED_SCOPE_GUARD(cleanupOnException, [&]() { Destroy(agent); };); + MAKE_NAMED_SCOPE_GUARD(cleanupOnException, [&]() { Destroy(); };); std::memset(queue_start_addr_, 0, kQueueSize); bytes_written_.resize(kQueueSize); @@ -208,7 +208,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent, bool use_xgm return HSA_STATUS_SUCCESS; } -template hsa_status_t BlitSdma::Destroy(const core::Agent& agent) { +template hsa_status_t BlitSdma::Destroy() { // Release all allocated resources and reset them to zero. if (queue_resource_.QueueId != 0) { diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 2edfce416f..b04241a940 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -738,7 +738,7 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) { rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? 
rec_eng : -1; if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) { - sdma->Destroy(*this); + sdma->Destroy(); delete sdma; sdma = nullptr; } @@ -750,7 +750,7 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { AMD::BlitKernel* kernl = new AMD::BlitKernel(queue); if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) { - kernl->Destroy(*this); + kernl->Destroy(); delete kernl; kernl = NULL; } @@ -912,7 +912,7 @@ void GpuAgent::ReleaseResources() { this->Disable(); for (auto& blit : blits_) { if (!blit.empty()) { - hsa_status_t status = blit->Destroy(*this); + hsa_status_t status = blit->Destroy(); assert(status == HSA_STATUS_SUCCESS); } } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h index 69c7c4a26b..18c0166b9e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h @@ -151,6 +151,17 @@ static __forceinline unsigned long long int strtoull(const char* str, rocr::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__); \ } while (false); +// Logs a signal's handle and its start/end/elapsed ticks when `flag` is set in log_flags. +#define LogSignalDuration(flag, signal, msg) \ + do { \ + if (hsa_flag_isset64(log_flags, flag)) { \ + amd_signal_t* amd_signal = reinterpret_cast<amd_signal_t*>((signal).handle); \ + rocr::log_printf(__FILENAME__, __LINE__, \ + "%s Signal = (0x%lx), ticks start/end = %lu / %lu, Ticks elapsed = %lu", msg, (signal).handle, \ + amd_signal->start_ts, amd_signal->end_ts, amd_signal->end_ts - amd_signal->start_ts); \ + } \ + } while (false); + // A macro to remove unused variable warnings #define UNUSED(x) (void)(x)