From fca301772cb0eec236a23fee0beea2b0c18d19e8 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 10 Jun 2020 12:06:58 -0700 Subject: [PATCH] Add logging support for AQL packet Use AMD_LOG_LEVEL=4 and AMD_LOG_MASK=8 to print AQL log explicitly Change-Id: I4209d91b460e64be44261d3ab773580067e47c29 [ROCm/clr commit: 2b771d2f5fcdbe1f325e50426d1ba12701561f84] --- projects/clr/rocclr/device/rocm/rocdevice.cpp | 7 +-- .../clr/rocclr/device/rocm/rocvirtual.cpp | 46 ++++++++++++++++++- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 42ff473d21..1ee1504983 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -199,9 +199,10 @@ void Device::setupCpuAgent() { cpu_agent_ = cpu_agents_[index].agent; system_segment_ = cpu_agents_[index].fine_grain_pool; system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool; - LogPrintfInfo("Numa select cpu agent[%u]=0x%llx(fine=0x%llx,coarse=0x%llx) for gpu agent=0x%llx", - index, cpu_agent_.handle, system_segment_.handle, system_coarse_segment_.handle, - _bkendDevice.handle); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa select cpu agent[%zu]=0x%zx(fine=0x%zx,coarse=0x%zx) " + "for gpu agent=0x%zx", + index, cpu_agent_.handle, system_segment_.handle, system_coarse_segment_.handle, + _bkendDevice.handle); } Device::~Device() { diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 326c896fb3..955f083805 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -91,6 +91,10 @@ static const hsa_barrier_and_packet_t kBarrierReleasePacket = { double Timestamp::ticksToTime_ = 0; +static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) { + return (v >> pos) & ((1 << width) - 1); +}; + bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { if (numMemObj > 0) { // Allocate the array of memory objects for dependency tracking @@ -482,7 +486,31 @@ bool VirtualGPU::dispatchGenericAqlPacket( if (header != 0) { packet_store_release(reinterpret_cast(aql_loc), header, rest); } + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, + "[%zx] HWq=0x%zx, Dispatch Header = 0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, " + "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx", + std::this_thread::get_id(), gpu_queue_, + header, extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, + HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + rest, reinterpret_cast(packet)->grid_size_x, + reinterpret_cast(packet)->grid_size_y, + reinterpret_cast(packet)->grid_size_z, + reinterpret_cast(packet)->workgroup_size_x, + reinterpret_cast(packet)->workgroup_size_y, + reinterpret_cast(packet)->workgroup_size_z, + reinterpret_cast(packet)->private_segment_size, + reinterpret_cast(packet)->group_segment_size, + reinterpret_cast(packet)->kernel_object, + reinterpret_cast(packet)->kernarg_address, + reinterpret_cast(packet)->completion_signal); } + //hsa_queue_store_write_index_release(gpu_queue_, index); hsa_signal_store_release(gpu_queue_->doorbell_signal, index - 1); @@ -547,9 +575,23 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { hsa_barrier_and_packet_t* aql_loc = &(reinterpret_cast(gpu_queue_->base_address))[index & queueMask]; *aql_loc = *packet; - __atomic_store_n(reinterpret_cast(aql_loc), kBarrierPacketHeader, __ATOMIC_RELEASE); + __atomic_store_n(reinterpret_cast(aql_loc), kBarrierPacketHeader, __ATOMIC_RELEASE); - hsa_signal_store_release(gpu_queue_->doorbell_signal, index); + hsa_signal_store_release(gpu_queue_->doorbell_signal, index); + ClPrint(amd::LOG_DEBUG, amd::LOG_AQL, + "[%zx] HWq=0x%zx, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d, release=%d), " + "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx", + std::this_thread::get_id(), gpu_queue_, kBarrierPacketHeader, + extractAqlBits(kBarrierPacketHeader, HSA_PACKET_HEADER_TYPE, + HSA_PACKET_HEADER_WIDTH_TYPE), + extractAqlBits(kBarrierPacketHeader, HSA_PACKET_HEADER_BARRIER, + HSA_PACKET_HEADER_WIDTH_BARRIER), + extractAqlBits(kBarrierPacketHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE), + extractAqlBits(kBarrierPacketHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE), + packet->dep_signal[0], packet->dep_signal[1], packet->dep_signal[2], + packet->dep_signal[3], packet->dep_signal[4], packet->completion_signal); } /**