SWDEV-539130 - Log blit copy duration (#258)

Co-authored-by: Pengda Xie <pengda.xie@amd.com>
Dieser Commit ist enthalten in:
systems-assistant[bot]
2025-09-03 10:01:47 -07:00
committet von GitHub
Ursprung b9fc643a56
Commit 83a10986a4
7 geänderte Dateien mit 35 neuen und 21 gelöschten Zeilen
@@ -71,10 +71,8 @@ class BlitKernel : public core::Blit {
///
/// @note: The call will block until all AQL packets have been executed.
///
/// @param agent Agent passed to Initialize.
///
/// @return hsa_status_t
virtual hsa_status_t Destroy(const core::Agent& agent) override;
virtual hsa_status_t Destroy() override;
/// @brief Submit an AQL packet to perform vector copy. The call is blocking
/// until the command execution is finished.
@@ -189,6 +187,8 @@ class BlitKernel : public core::Blit {
std::map<KernelType, KernelCode> kernels_;
const core::Agent* agent_;
/// AQL queue for submitting the vector copy kernel.
core::Queue* queue_;
uint32_t queue_bitmask_;
@@ -94,10 +94,8 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
///
/// @note: The call will block until all packets have executed.
///
/// @param agent Agent passed to Initialize.
///
/// @return hsa_status_t
virtual hsa_status_t Destroy(const core::Agent& agent) override;
virtual hsa_status_t Destroy() override;
/// @brief Submit a linear copy command to the queue buffer.
///
@@ -63,7 +63,7 @@ class Blit {
/// @param agent Agent passed to Initialize.
///
/// @return hsa_status_t
virtual hsa_status_t Destroy(const core::Agent& agent) = 0;
virtual hsa_status_t Destroy() = 0;
/// @brief Submit a linear copy command to the underlying compute device's
/// control block. The call is blocking until the command execution is
@@ -551,6 +551,7 @@ BlitKernel::BlitKernel(core::Queue* queue)
BlitKernel::~BlitKernel() {}
hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
agent_ = &agent;
queue_bitmask_ = queue_->public_handle()->size - 1;
bytes_written_.resize(queue_->public_handle()->size);
@@ -561,15 +562,15 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
return status;
}
const AMD::GpuAgent& gpuAgent = static_cast<const AMD::GpuAgent&>(agent);
const AMD::GpuAgent* gpuAgent = static_cast<const AMD::GpuAgent*>(agent_);
kernarg_async_ = reinterpret_cast<KernelArgs*>(
gpuAgent.system_allocator()(queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16),
gpuAgent->system_allocator()(queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16),
16, core::MemoryRegion::AllocateNoFlags));
kernarg_async_mask_ = queue_->public_handle()->size - 1;
// Obtain the number of compute units in the underlying agent.
num_cus_ = gpuAgent.properties().NumFComputeCores / 4;
num_cus_ = gpuAgent->properties().NumFComputeCores / 4;
// Assemble shaders to AQL code objects.
std::map<KernelType, const char*> kernel_names = {
@@ -579,29 +580,29 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
for (auto kernel_name : kernel_names) {
KernelCode& kernel = kernels_[kernel_name.first];
gpuAgent.AssembleShader(kernel_name.second, AMD::GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
gpuAgent->AssembleShader(kernel_name.second, AMD::GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
kernel.code_buf_size_);
}
if (agent.profiling_enabled()) {
if (agent_->profiling_enabled()) {
return EnableProfiling(true);
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
hsa_status_t BlitKernel::Destroy() {
std::lock_guard<std::mutex> guard(lock_);
const AMD::GpuAgent& gpuAgent = static_cast<const AMD::GpuAgent&>(agent);
const AMD::GpuAgent* gpuAgent = static_cast<const AMD::GpuAgent*>(agent_);
for (auto kernel_pair : kernels_) {
gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
gpuAgent->ReleaseShader(kernel_pair.second.code_buf_,
kernel_pair.second.code_buf_size_);
}
if (kernarg_async_ != NULL) {
gpuAgent.system_deallocator()(kernarg_async_);
gpuAgent->system_deallocator()(kernarg_async_);
}
if (completion_signal_.handle != 0) {
@@ -635,6 +636,11 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
return HSA_STATUS_ERROR;
}
if(agent_->profiling_enabled()) {
LogSignalDuration(HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS, completion_signal_,
"BlitKernel::SubmitLinearCopyCommand");
}
return HSA_STATUS_SUCCESS;
}
@@ -171,7 +171,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
if (queue_start_addr_ == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
MAKE_NAMED_SCOPE_GUARD(cleanupOnException, [&]() { Destroy(agent); };);
MAKE_NAMED_SCOPE_GUARD(cleanupOnException, [&]() { Destroy(); };);
std::memset(queue_start_addr_, 0, kQueueSize);
bytes_written_.resize(kQueueSize);
@@ -208,7 +208,7 @@ hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgm
return HSA_STATUS_SUCCESS;
}
template <bool useGCR> hsa_status_t BlitSdma<useGCR>::Destroy(const core::Agent& agent) {
template <bool useGCR> hsa_status_t BlitSdma<useGCR>::Destroy() {
// Release all allocated resources and reset them to zero.
if (queue_resource_.QueueId != 0) {
@@ -738,7 +738,7 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
rec_eng = uses_rec_sdma_eng_id_mask_ || !use_xgmi ? rec_eng : -1;
if (sdma->Initialize(*this, use_xgmi, copy_size_override, rec_eng) != HSA_STATUS_SUCCESS) {
sdma->Destroy(*this);
sdma->Destroy();
delete sdma;
sdma = nullptr;
}
@@ -750,7 +750,7 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
AMD::BlitKernel* kernl = new AMD::BlitKernel(queue);
if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
kernl->Destroy(*this);
kernl->Destroy();
delete kernl;
kernl = NULL;
}
@@ -912,7 +912,7 @@ void GpuAgent::ReleaseResources() {
this->Disable();
for (auto& blit : blits_) {
if (!blit.empty()) {
hsa_status_t status = blit->Destroy(*this);
hsa_status_t status = blit->Destroy();
assert(status == HSA_STATUS_SUCCESS);
}
}
@@ -151,6 +151,16 @@ static __forceinline unsigned long long int strtoull(const char* str,
rocr::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__); \
} while (false);
// Logs the start/end timestamps recorded on a signal, and their difference,
// when `flag` is set in `log_flags`.
//
// flag    Log-flag bit tested with hsa_flag_isset64(log_flags, flag).
// signal  Signal object exposing a `.handle` member; the handle is
//         reinterpreted as an amd_signal_t* to read start_ts / end_ts.
// msg     C string printed as the prefix of the log line.
//
// The handle value (not the signal object itself) is passed for the hex
// conversion, and every 64-bit value is cast to unsigned long long so the
// arguments match the %llx / %llu conversions regardless of the width of
// `long`. NOTE(review): assumes rocr::log_printf takes printf-style
// formatting -- confirm.
//
// The trailing semicolon after `while (false)` is kept to match the
// neighbouring logging macro in this file.
#define LogSignalDuration(flag, signal, msg)                                                  \
  do {                                                                                        \
    if (hsa_flag_isset64(log_flags, (flag))) {                                                \
      const amd_signal_t* log_amd_signal_ =                                                   \
          reinterpret_cast<const amd_signal_t*>((signal).handle);                             \
      rocr::log_printf(                                                                       \
          __FILENAME__, __LINE__,                                                             \
          "%s Signal = (0x%llx), ticks start/end = %llu / %llu, Ticks elapsed = %llu", (msg), \
          static_cast<unsigned long long>((signal).handle),                                   \
          static_cast<unsigned long long>(log_amd_signal_->start_ts),                         \
          static_cast<unsigned long long>(log_amd_signal_->end_ts),                           \
          static_cast<unsigned long long>(log_amd_signal_->end_ts -                           \
                                          log_amd_signal_->start_ts));                        \
    }                                                                                         \
  } while (false);
// A macro to remove unused variable warnings: evaluates `x` and discards the
// result by casting it to void.
#define UNUSED(x) (void)(x)