Files
rocm-systems/wddm/queue.cpp
T
Flora Cui 0761dd0146 librocdxg: Increase AQL frame size calculation
to prevent PM4 command buffer overflow

Signed-off-by: Flora Cui <flora.cui@amd.com>
Reviewed-by: Longlong Yao <Longlong.Yao@amd.com>
Part-of: <http://10.67.69.192/wsl/rocr-runtime/-/merge_requests/113>
2025-11-28 14:53:07 +08:00

1141 líneas
37 KiB
C++

////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#include <cstring>
#include <cinttypes>
#include <cstddef>
#include "impl/wddm/queue.h"
#include "impl/registers.h"
#include "impl/hsa/hsa.h"
#include "impl/hsa/hsa_ven_amd_loader.h"
extern hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal);
extern hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed(
hsa_signal_t signal, hsa_signal_condition_t condition,
hsa_signal_value_t compare_value, uint64_t timeout_hint,
hsa_wait_state_t wait_state_hint);
extern void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal,
hsa_signal_value_t value);
extern hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address(
const void *device_address, const void **host_address);
namespace wsl {
namespace thunk {
hsa_status_t WDDMQueue::SwsInit(void) {
if (!device->CreateSyncobj(&syncobj, &sync_addr))
return HSA_STATUS_ERROR;
if (device->AllocUserQueueMemFromUMD()) {
GpuMemory *gpu_mem = nullptr;
GpuMemoryCreateInfo create_info{};
create_info.domain = thunk_proxy::kUserQueue;
create_info.size = device->GetSwsQueueSize();
create_info.engine_flag = thunk_proxy::QueueEngine2EngineFlag(queue_engine);
auto code = device->CreateGpuMemory(create_info, &gpu_mem);
if (code != ErrorCode::Success) {
device->DestroySyncobj(syncobj);
return HSA_STATUS_ERROR;
}
queue_mem = gpu_mem->GetGpuMemoryHandle();
queue = gpu_mem->GetAllocationHandle(0);
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t WDDMQueue::SwsFini(void) {
device->DestroySyncobj(syncobj);
return HSA_STATUS_SUCCESS;
}
hsa_status_t WDDMQueue::SwsSubmit(uint64_t command_addr,
uint64_t command_size,
uint64_t fence_value) {
if (!device->SubmitToSwQueue(this, command_addr, command_size, fence_value))
return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;
}
hsa_status_t WDDMQueue::HwsInit(void) {
if (!device->CreateHwQueue(this))
return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;
}
hsa_status_t WDDMQueue::HwsFini(void) {
if (!device->DestroyHwQueue(this))
return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;
}
hsa_status_t WDDMQueue::HwsSubmit(uint64_t command_addr,
uint64_t command_size,
uint64_t fence_value) {
if (!device->SubmitToHwQueue(this, command_addr, command_size, fence_value))
return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;
}
hsa_status_t WDDMQueue::SetPriority(hsa_amd_queue_priority_t priority) {
if (!use_hws)
return HSA_STATUS_SUCCESS;
thunk_proxy::SchedLevel new_prio = ConvertSchedLevel(priority);
if (prio == new_prio)
return HSA_STATUS_SUCCESS;
pr_debug("set prio %d -> %d\n", prio, new_prio);
device->DestroyHwQueue(this);
prio = new_prio;
return HwsInit();
}
void ComputeQueue::HandleError(hsa_status_t status) {
hsa_signal_t sig = amd_queue_rocr_->queue_inactive_signal;
hsa_signal_value_t val = -1;
struct queue_error_t {
uint32_t code;
hsa_status_t status;
};
static const queue_error_t QueueErrors[] = {
{2, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS},
{4, HSA_STATUS_ERROR_INVALID_ALLOCATION},
{8, HSA_STATUS_ERROR_INVALID_CODE_OBJECT},
//{16, HSA_STATUS_ERROR_INVALID_ARGUMENT},
{32, HSA_STATUS_ERROR_INVALID_PACKET_FORMAT},
{64, HSA_STATUS_ERROR_INVALID_ARGUMENT},
//{128, HSA_STATUS_ERROR_OUT_OF_REGISTERS},
//{0x20000000, HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION},
//{0x40000000, HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION},
{0x80000000, HSA_STATUS_ERROR_EXCEPTION},
};
for (std::size_t i = 0; i < sizeof(QueueErrors) / sizeof(QueueErrors[0]); ++i) {
if (QueueErrors[i].status == status) {
val = QueueErrors[i].code;
pr_err("error %d, sig_val %ld\n", status, val);
break;
}
}
if (sig.handle) {
hsakmt_hsa_signal_store_screlease(sig, val);
}
if (error_code_) {
error_code_->store(val, std::memory_order_release);
}
}
void ComputeQueue::AqlToPm4Thread(ComputeQueue *queue) {
// This timing system is used for sleeping this Thread
// when one packet is invalid for about 2 seconds.
std::chrono::steady_clock::time_point start_time, time;
// Set the polling timeout value for 2 seconds
const std::chrono::milliseconds kMaxElapsed(2000);
uint64_t current_position = queue->GetAqlWriteIndex();
bool sleep = false;
start_time = std::chrono::steady_clock::now();
while (true) {
if (!queue->IsInvalidPacket()) {
hsa_status_t status = queue->Process();
if (status != HSA_STATUS_SUCCESS) {
pr_err("process compute queue fail status = %08x\n", status);
queue->HandleError(status);
break;
}
sleep = false;
} else {
if (current_position == queue->GetAqlWriteIndex()) {
time = std::chrono::steady_clock::now();
if (time - start_time > kMaxElapsed)
sleep = true;
} else {
start_time = std::chrono::steady_clock::now();
current_position = queue->GetAqlWriteIndex();
sleep = false;
}
}
if ((queue->GetRingWptr()->load() > queue->GetRingRptr()->load()) && !sleep)
continue;
std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);
// CPU wait for valid packet
if (queue->GetRingWptr()->load() <= queue->GetRingRptr()->load() ||
(sleep && queue->IsInvalidPacket())) {
if (queue->thread_stop_)
break;
pr_debug("wait %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
queue->ring, queue->GetRingWptr()->load(), queue->GetRingRptr()->load());
queue->thread_cond_.wait(lock);
}
}
pr_debug("aql to pm4 thread %p exit\n", queue->ring);
}
ComputeQueue::ComputeQueue(WDDMDevice *device,
void *ring,
uint64_t ring_size,
std::atomic<uint64_t> *ring_wptr,
std::atomic<uint64_t> *ring_rptr,
volatile int64_t *error_addr,
uint32_t cmdbuf_size,
uint32_t engine,
bool use_hws) :
WDDMQueue(device, 0, cmdbuf_size, engine, use_hws),
ring(ring),
ring_size(ring_size),
ring_wptr(ring_wptr),
ring_rptr(ring_rptr),
error_code_(reinterpret_cast<volatile std::atomic<long int>*>(error_addr)),
ib_start_addr(0),
ib_size(0),
sync_point(0),
cmdbuf_aql_frame_write_index(0),
cmdbuf_aql_frame_size(0),
needs_barrier(true),
ready_to_submit(false),
platform_atomic_support_(false),
signal_addr_(NULL),
thread_stop_(false),
scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()),
scratch_size_per_wave_(0),
scratch_size_(0),
scratch_base_(nullptr) {
bool ret = device->CreateQueue(this);
assert(ret);
GpuMemoryCreateInfo create_info{};
create_info.size = dxg_runtime->page_size;
create_info.domain = thunk_proxy::kSystem;
GpuMemory *gpu_mem = nullptr;
auto code = device->CreateGpuMemory(create_info, &gpu_mem);
assert(code == ErrorCode::Success);
amd_queue_mem_ = gpu_mem->GetGpuMemoryHandle();
amd_queue_ = reinterpret_cast<amd_queue_v2_t*>(gpu_mem->GpuAddress());
amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id));
aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this);
}
ComputeQueue::~ComputeQueue() {
thread_cond_lock_.lock();
thread_stop_ = true;
thread_cond_lock_.unlock();
thread_cond_.notify_one();
aql_to_pm4_thread_.join();
//doorbell_signal_->Release();
device->DestroyQueue(this);
if (scratch_base_) {
auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_);
delete scratch_gpu_mem;
}
auto amd_queue_gpu_mem = GpuMemory::Convert(amd_queue_mem_);
delete amd_queue_gpu_mem;
}
void ComputeQueue::InitScratchSRD() {
// Populate scratch resource descriptor
SQ_BUF_RSRC_WORD0 srd0;
uintptr_t scratch_base = uintptr_t(scratch_base_);
srd0.bits.BASE_ADDRESS = scratch_base;
uint32_t srd1_u32;
if (device->Major() < 11) {
SQ_BUF_RSRC_WORD1 srd1;
srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32;
srd1.bits.STRIDE = 0;
srd1.bits.CACHE_SWIZZLE = 0;
srd1.bits.SWIZZLE_ENABLE = 1;
srd1_u32 = srd1.u32All;
} else {
SQ_BUF_RSRC_WORD1_GFX11 srd1;
srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32;
srd1.bits.STRIDE = 0;
srd1.bits.SWIZZLE_ENABLE = 1;
srd1_u32 = srd1.u32All;
}
SQ_BUF_RSRC_WORD2 srd2;
srd2.bits.NUM_RECORDS = scratch_size_;
uint32_t srd3_u32;
if (device->Major() < 10) {
SQ_BUF_RSRC_WORD3 srd3;
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT;
srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32;
srd3.bits.ELEMENT_SIZE = 1; // 4
srd3.bits.INDEX_STRIDE = 3; // 64
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.ATC__CI__VI = 0;
srd3.bits.HASH_ENABLE = 0;
srd3.bits.HEAP = 0;
srd3.bits.MTYPE__CI__VI = 0;
srd3.bits.TYPE = SQ_RSRC_BUF;
srd3_u32 = srd3.u32All;
} else if (device->Major() == 10) {
SQ_BUF_RSRC_WORD3_GFX10 srd3;
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
srd3.bits.RESERVED1 = 0;
srd3.bits.INDEX_STRIDE = 0; // filled in by CP
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.RESOURCE_LEVEL = 1;
srd3.bits.RESERVED2 = 0;
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode
srd3.bits.TYPE = SQ_RSRC_BUF;
srd3_u32 = srd3.u32All;
} else if (device->Major() == 11) {
SQ_BUF_RSRC_WORD3_GFX11 srd3;
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
srd3.bits.RESERVED1 = 0;
srd3.bits.INDEX_STRIDE = 0; // filled in by CP
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.RESERVED2 = 0;
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode
srd3.bits.TYPE = SQ_RSRC_BUF;
srd3_u32 = srd3.u32All;
} else {
SQ_BUF_RSRC_WORD3_GFX12 srd3;
srd3.bits.DST_SEL_X = SQ_SEL_X;
srd3.bits.DST_SEL_Y = SQ_SEL_Y;
srd3.bits.DST_SEL_Z = SQ_SEL_Z;
srd3.bits.DST_SEL_W = SQ_SEL_W;
srd3.bits.FORMAT = BUF_FORMAT_32_UINT;
srd3.bits.RESERVED1 = 0;
srd3.bits.INDEX_STRIDE = 0; // filled in by CP
srd3.bits.ADD_TID_ENABLE = 1;
srd3.bits.WRITE_COMPRESS_ENABLE = 0;
srd3.bits.COMPRESSION_EN = 0;
srd3.bits.COMPRESSION_ACCESS_MODE = 0;
srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode
srd3.bits.TYPE = SQ_RSRC_BUF;
srd3_u32 = srd3.u32All;
}
// Update Queue's Scratch descriptor's property
amd_queue_->scratch_resource_descriptor[0] = srd0.u32All;
amd_queue_->scratch_resource_descriptor[1] = srd1_u32;
amd_queue_->scratch_resource_descriptor[2] = srd2.u32All;
amd_queue_->scratch_resource_descriptor[3] = srd3_u32;
// Populate flat scratch parameters in amd_queue_.
amd_queue_->scratch_backing_memory_location = scratch_base;
// For backwards compatibility this field records the per-lane scratch
// for a 64 lane wavefront. If scratch was allocated for 32 lane waves
// then the effective size for a 64 lane wave is halved.
amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64;
if (device->Major() < 11) {
COMPUTE_TMPRING_SIZE tmpring_size;
tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / 1024;
tmpring_size.bits.WAVES = scratch_waves_;
amd_queue_->compute_tmpring_size = tmpring_size.u32All;
} else if (device->Major() == 11) {
COMPUTE_TMPRING_SIZE_GFX11 tmpring_size;
tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8;
tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine();
amd_queue_->compute_tmpring_size = tmpring_size.u32All;
} else {
COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {};
tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ >> 8;
tmpring_size.bits.WAVES = scratch_waves_ / device->NumShaderEngine();
amd_queue_->compute_tmpring_size = tmpring_size.u32All;
}
return;
}
bool ComputeQueue::UpdateScratch(uint32_t private_segment_size, bool wave32) {
const uint32_t wavefront = wave32 ? 32 : 64;
const uint32_t alignment = 1024 / wavefront;
private_segment_size = AlignUp(private_segment_size, alignment);
uint32_t scratch_size_per_wave = private_segment_size * wavefront;
uint32_t scratch_size = scratch_size_per_wave * scratch_waves_;
if (scratch_size_ >= scratch_size)
return true;
pr_debug("need realloc scratch buffer, size %x -> %x\n",
scratch_size_, scratch_size);
GpuMemoryCreateInfo create_info{};
create_info.size = scratch_size;
create_info.domain = thunk_proxy::kLocal;
GpuMemory *gpu_mem = nullptr;
auto code = device->CreateGpuMemory(create_info, &gpu_mem);
if (code != ErrorCode::Success)
return false;
if (scratch_base_) {
auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_);
delete scratch_gpu_mem;
}
scratch_size_per_wave_ = scratch_size_per_wave;
scratch_size_ = scratch_size;
scratch_base_ = reinterpret_cast<void *>(gpu_mem->GpuAddress());
scratch_mem_ = gpu_mem->GetGpuMemoryHandle();
InitScratchSRD();
return true;
}
bool ComputeQueue::RelocateCmdbufScratchBase(uint64_t addr) {
if (scratch_base_offset_array_.empty())
return true;
for (size_t i = 0; i < scratch_base_offset_array_.size(); i++) {
uint32_t *p_compute_user_data =
reinterpret_cast<uint32_t *>(addr + scratch_base_offset_array_[i]);
if (device->Major() >= 11) {
p_compute_user_data[0] = Ptr48Low32(scratch_base_);
p_compute_user_data[1] = Ptr48High8(scratch_base_);
} else {
p_compute_user_data[0] = PtrLow32(scratch_base_);
p_compute_user_data[1] = (p_compute_user_data[1] & 0xffff0000) | PtrHigh32(scratch_base_);
}
}
scratch_base_offset_array_.clear();
return true;
}
uint32_t ComputeQueue::UpdateIndexStride(uint32_t srd, bool wave32) {
assert(device->Major() < 13);
if (device->Major() == 10) {
SQ_BUF_RSRC_WORD3_GFX10 srd3;
srd3.u32All = srd;
srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3;
return srd3.u32All;
} else if (device->Major() == 11) {
SQ_BUF_RSRC_WORD3_GFX11 srd3;
srd3.u32All = srd;
srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3;
return srd3.u32All;
} else if (device->Major() == 12) {
SQ_BUF_RSRC_WORD3_GFX12 srd3;
srd3.u32All = srd;
srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3;
return srd3.u32All;
}
return srd;
}
uint64_t ComputeQueue::GetKernelObjAddr(uint64_t addr) const {
/* convert dev_addr to host_addr */
auto code = get_gpu_mem((void*)addr);
if (code && code->IsBlitKernelObject()) {
return code->GpuAddress();
}
uint64_t host_addr = 0;
auto ret = hsakmt_hsa_ven_amd_loader_query_host_address(reinterpret_cast<const void *>(addr),
reinterpret_cast<const void **>(&host_addr));
if (ret == HSA_STATUS_SUCCESS) {
return host_addr;
}
pr_err("failed to query host address for kernel object %p, ret=%d\n", (void*)addr, ret);
return 0;
}
void ComputeQueue::RingDoorbell() {
thread_cond_lock_.lock();
thread_cond_lock_.unlock();
pr_debug("notify %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
ring, GetRingWptr()->load(), GetRingRptr()->load());
thread_cond_.notify_one();
}
hsa_status_t ComputeQueue::Init(void) {
hsa_status_t ret = use_hws ? HwsInit() : SwsInit();
if (ret)
return ret;
ib_start_addr = cmdbuf_addr;
cmdbuf_aql_frame_size = device->GetAqlFrameSize();
platform_atomic_support_ = device->SupportPlatformAtomic();
return ret;
}
hsa_status_t ComputeQueue::Fini(void) {
return use_hws ? HwsFini() : SwsFini();
}
hsa_status_t ComputeQueue::PreSubmit(void) {
if (!device->WaitPagingFence(this))
return HSA_STATUS_ERROR;
RelocateCmdbufScratchBase(ib_start_addr);
return HSA_STATUS_SUCCESS;
}
hsa_status_t ComputeQueue::EndSubmit(void) {
// record last submitted cmdbuf_aql_frame_write_index to see if GPU is hungry
sync_point = cmdbuf_aql_frame_write_index;
ib_start_addr = cmdbuf_addr +
(cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) *
cmdbuf_aql_frame_size;
ib_size = 0;
return HSA_STATUS_SUCCESS;
}
hsa_status_t ComputeQueue::Submit(void) {
hsa_status_t ret = PreSubmit();
if (ret)
return HSA_STATUS_ERROR;
ret = use_hws ?
HwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index) :
SwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index);
if (ret)
return HSA_STATUS_ERROR;
ret = EndSubmit();
if (ret)
return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;
}
hsa_status_t
ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet) {
pr_debug("queue %p kernel dispatch head=%x setup=%x wx=%x wy=%x wz=%x "
"gx=%x gy=%x gz=%x ps=%x gs=%x ko=%" PRIx64 " ka=%p cs=%" PRIx64 "\n",
ring, packet->header,
packet->setup, packet->workgroup_size_x, packet->workgroup_size_y,
packet->workgroup_size_z, packet->grid_size_x, packet->grid_size_y,
packet->grid_size_z, packet->private_segment_size,
packet->group_segment_size, packet->kernel_object, packet->kernarg_address,
packet->completion_signal.handle);
if (packet->workgroup_size_x > 1024 ||
packet->workgroup_size_y > 1024 ||
packet->workgroup_size_z > 1024)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
int major = device->Major();
int i = ib_size;
const amd_kernel_code_t* kernel_object =
(const amd_kernel_code_t *)GetKernelObjAddr(packet->kernel_object);
if (kernel_object == NULL) {
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
}
void* entry = (void*)(packet->kernel_object + kernel_object->kernel_code_entry_byte_offset);
assert((size_t)entry % AMD_ISA_ALIGN_BYTES == 0);
pr_debug("kernel object property=%x entry=%p lds=%x+%x\n",
kernel_object->kernel_code_properties, entry,
kernel_object->workgroup_group_segment_byte_size,
packet->group_segment_size);
if (packet->setup == 0 || packet->setup > 3)
return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
if (packet->group_segment_size > device->LdsSize())
return HSA_STATUS_ERROR_INVALID_ALLOCATION;
uint32_t lds_blks = device->LdsBlocks(packet);
if (lds_blks > 128)
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
const bool wave32 =
AMD_HSA_BITS_GET(kernel_object->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32);
assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size);
UpdateScratch(packet->private_segment_size, wave32);
amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
// Record start timestamp when enabling profiling
if (signal && EnableProfiling())
i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
// Build a barrier packet if it is requested
const bool is_barrier_packet = (packet->header >> HSA_PACKET_HEADER_BARRIER) & 0x1;
if (is_barrier_packet && needs_barrier)
i += cmd_util.BuildBarrier(cpu + i);
// flush cache
i += cmd_util.BuildAcquireMem(major, cpu + i);
if (major >= 11) {
AppendCmdbufSratchBaseOffset(
i + offsetof(struct SetScratchTemplate, scratch_lo));
i += cmd_util.BuildScratch(ScratchBase(), cpu + i);
i += cmd_util.BuildComputeShaderParams(cpu + i);
}
struct DispatchInfo info;
info.major = major;
info.pPacket = packet;
info.pEntry = entry;
info.pKernelObject = kernel_object;
info.ldsBlks = lds_blks;
info.pAmdQueue = amd_queue_;
info.wave32 = wave32;
info.srd = UpdateIndexStride(
info.pAmdQueue->scratch_resource_descriptor[3], wave32);
info.pScratchBase = ScratchBase();
info.scratchSizePerWave = ScratchSizePerWave();
memset(info.scratchBaseOffset, 0, sizeof(info.scratchBaseOffset));
info.offsetCnt = 0;
size_t size;
size = cmd_util.BuildDispatch(&info, cpu + i);
for (int j = 0; j < info.offsetCnt; j++)
AppendCmdbufSratchBaseOffset(i + info.scratchBaseOffset[j]);
i += size;
needs_barrier = (packet->completion_signal.handle == 0);
if (signal) {
// wait cs done
i += cmd_util.BuildBarrier(cpu + i);
// Record end timestamp when enabling profiling
if (EnableProfiling())
i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
// flush cache
i += cmd_util.BuildAcquireMem(major, cpu + i);
assert(signal->kind == AMD_SIGNAL_KIND_USER);
uint64_t *signal_addr = (uint64_t *)&signal->value;
pr_debug("signal value=%" PRIx64 "\n", signal->value);
if (platform_atomic_support_)
i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
else
signal_addr_ = signal_addr;
}
// The ring_rptr is used to record pm4 queue rptr value,
// dispatch readptr position, this is used to share rptr with
// aql queue.
if (platform_atomic_support_)
i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
else
i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
// Check if we exceeded the frame size
if ((i - ib_size) > cmdbuf_aql_frame_size) {
pr_err("PM4 command buffer overflow in KernelDispatch: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
ib_size = i;
cmdbuf_aql_frame_write_index++;
packet->header = HSA_PACKET_TYPE_INVALID;
return HSA_STATUS_SUCCESS;
}
hsa_status_t
ComputeQueue::BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or) {
pr_debug("queue %p %s head=%x dep %" PRIx64 " %" PRIx64 " %" PRIx64
" %" PRIx64 " %" PRIx64 " cs=%" PRIx64"\n",
ring, is_or ? "or" : "and",
packet->header, packet->dep_signal[0].handle,
packet->dep_signal[1].handle, packet->dep_signal[2].handle,
packet->dep_signal[3].handle, packet->dep_signal[4].handle,
packet->completion_signal.handle);
// fix me: can we use gpu packet?
if (is_or) {
bool unsignaled = true;
hsa_signal_t sig[5];
int n = 0;
for (int i = 0; i < 5; i++) {
if (packet->dep_signal[i].handle)
sig[n++] = packet->dep_signal[i];
}
while (n) {
for (int i = 0; i < n; i++) {
if (!hsakmt_hsa_signal_load_relaxed(sig[i])) {
unsignaled = false;
break;
}
}
if (!unsignaled)
break;
std::this_thread::sleep_for(std::chrono::microseconds(20));
}
} else {
for (int i = 0; i < 5; i++) {
if (!packet->dep_signal[i].handle)
continue;
hsa_signal_value_t value =
hsakmt_hsa_signal_wait_relaxed(packet->dep_signal[i], HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
assert(value == 0);
}
}
int major = device->Major();
int i = ib_size;
if (packet->completion_signal.handle != 0) {
amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
assert(signal->kind == AMD_SIGNAL_KIND_USER);
uint64_t *signal_addr = (uint64_t *)&signal->value;
pr_debug("signal value=%" PRIx64 "\n", signal->value);
// Record start timestamp when enabling profiling
if (EnableProfiling())
i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
if (needs_barrier)
i += cmd_util.BuildBarrier(cpu + i);
needs_barrier = false;
// Record end timestamp when enabling profiling
if (EnableProfiling())
i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
// flush cache
i += cmd_util.BuildAcquireMem(major, cpu + i);
if (platform_atomic_support_)
i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
else
signal_addr_ = signal_addr;
}
// The ring_rptr is used to record pm4 queue rptr value,
// dispatch readptr position, this is used to share rptr with
// aql queue.
if (platform_atomic_support_)
i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
else
i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
// Check if we exceeded the frame size
if ((i - ib_size) > cmdbuf_aql_frame_size) {
pr_err("PM4 command buffer overflow in BarrierGeneric: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
ib_size = i;
cmdbuf_aql_frame_write_index++;
packet->header = HSA_PACKET_TYPE_INVALID;
return HSA_STATUS_SUCCESS;
}
hsa_status_t ComputeQueue::VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet) {
constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1;
assert(packet->ven_hdr == AMD_AQL_FORMAT_PM4_IB);
uint8_t op = (packet->ib_jump_cmd[0] >> PM4_OPCODE_SHIFT) & 0xff;
assert(op == IT_INDIRECT_BUFFER);
uint32_t* pm4_addr = reinterpret_cast<uint32_t*>((static_cast<uint64_t>(packet->ib_jump_cmd[2]) << 32) | (static_cast<uint64_t>(packet->ib_jump_cmd[1]) & ~3ull));
uint32_t pm4_size = packet->ib_jump_cmd[3]&0xfffff;
pr_debug("queue %p %s VENDOR_SPECIFIC pkt pm4_addr %p pm4_size %#x cs=%" PRIx64"\n",
ring, dxg_runtime->vendor_packet_process ? "process" : "skip", pm4_addr, pm4_size,
packet->completion_signal.handle);
for (int i = 0; i < pm4_size; i++) {
pr_debug("pm4_addr[%d]=%#x\n", i, pm4_addr[i]);
}
int i = ib_size;
if (dxg_runtime->vendor_packet_process) {
int major = device->Major();
memcpy(cpu+i, pm4_addr, pm4_size * sizeof(uint32_t));
i += pm4_size * sizeof(uint32_t);
if (packet->completion_signal.handle != 0) {
amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle;
assert(signal->kind == AMD_SIGNAL_KIND_USER);
uint64_t *signal_addr = (uint64_t *)&signal->value;
pr_debug("signal value=%" PRIx64 "\n", signal->value);
// Record start timestamp when enabling profiling
if (EnableProfiling())
i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i);
//if (needs_barrier)
i += cmd_util.BuildBarrier(cpu + i);
//needs_barrier = false;
// Record end timestamp when enabling profiling
if (EnableProfiling())
i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i);
// flush cache
i += cmd_util.BuildAcquireMem(major, cpu + i);
if (platform_atomic_support_)
i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1);
else
signal_addr_ = signal_addr;
}
} else {
if (packet->completion_signal.handle != 0) {
hsakmt_hsa_signal_store_screlease(packet->completion_signal, 0);
}
}
// The ring_rptr is used to record pm4 queue rptr value,
// dispatch readptr position, this is used to share rptr with
// aql queue.
if (platform_atomic_support_)
i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i);
else
i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1);
// Check if we exceeded the frame size
if ((i - ib_size) > cmdbuf_aql_frame_size) {
pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size);
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
ib_size = i;
cmdbuf_aql_frame_write_index++;
packet->header = HSA_PACKET_TYPE_INVALID;
return HSA_STATUS_SUCCESS;
}
hsa_status_t ComputeQueue::SwitchAql2PM4(void) {
uint16_t *packet = (uint16_t *) ((char *)ring +
(cmdbuf_aql_frame_write_index % ring_size) * 64);
uint16_t header = (*packet >> HSA_PACKET_HEADER_TYPE);
header &= (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
hsa_kernel_dispatch_packet_t *aql_packet =
(hsa_kernel_dispatch_packet_t *)packet;
hsa_status_t ret;
switch (header) {
case HSA_PACKET_TYPE_KERNEL_DISPATCH:
ret = KernelDispatchAqlToPm4((char *)ib_start_addr, aql_packet);
if (ret != HSA_STATUS_SUCCESS)
return ret;
// Stop merging packages util below conditions are met:
// 1) The kernel with completion signal;
// 2) The cmdbuf_aql_frame_write_index reaches the end of cmdbuf
// 3) The queue is empty now, submit the package right now.
if (!(aql_packet->completion_signal.handle) &&
(cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) &&
(*sync_addr != sync_point))
return HSA_STATUS_SUCCESS;
break;
case HSA_PACKET_TYPE_BARRIER_AND:
BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet);
break;
case HSA_PACKET_TYPE_BARRIER_OR:
BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet, true);
break;
case HSA_PACKET_TYPE_VENDOR_SPECIFIC:
VendorSpecificAqlToPm4((char *)ib_start_addr, (amd_aql_pm4_ib *)aql_packet);
break;
case HSA_PACKET_TYPE_INVALID:
// When packets are submitted out of order, the format field of current AQL packet
// may not have been updated yet and is still INVALID. Return HSA_STATUS_SUCCESS and
// do not process AQL packets before the packet format field is updated.
assert(false && "Should not reach here, HSA_PACKET_TYPE_INVALID has been filtered in upper layer");
return HSA_STATUS_SUCCESS;
default:
return HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
}
ready_to_submit = true;
return HSA_STATUS_SUCCESS;
}
hsa_status_t ComputeQueue::Process(void) {
while (cmdbuf_aql_frame_write_index < ring_wptr->load() &&
!IsInvalidPacket()) {
pr_debug("process %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
ring, ring_wptr->load(), ring_rptr->load());
hsa_status_t ret;
// wait for next few cmdbuf slots to be free
// If wptr catch up the rptr in the cmdbuf, this needs wait for the rptr to free the cmdbuf.
// Here the wptr comes from queue->cmdbuf_aql_frame_write_index, while rptr comes from *queue->sync_addr.
if (*sync_addr + WDDMDevice::GetAqlFrameNum() <= cmdbuf_aql_frame_write_index) {
uint64_t value = cmdbuf_aql_frame_write_index - WDDMDevice::GetAqlFrameNum() + 1;
if (!device->CpuWait(&syncobj, &value, 1, false))
return HSA_STATUS_ERROR;
}
ret = SwitchAql2PM4();
if (ret != HSA_STATUS_SUCCESS)
return ret;
if (!ready_to_submit)
continue;
ret = Submit();
if (ret != HSA_STATUS_SUCCESS)
return ret;
// CPU wait for GPU fence, and cpu update the signal.
if (!platform_atomic_support_ && signal_addr_) {
// CPU wait for GPU fence
if (!device->CpuWait(&syncobj, &cmdbuf_aql_frame_write_index, 1, false))
return HSA_STATUS_ERROR;
//CPU update completional signal
atomic::Decrement(signal_addr_);
signal_addr_ = NULL;
}
ready_to_submit = false;
pr_debug("done %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n",
ring, ring_wptr->load(), ring_rptr->load());
}
return HSA_STATUS_SUCCESS;
}
void SDMAQueue::SdmaThread(SDMAQueue *queue) {
while (true) {
decltype(queue->wptr_queue_) pendings;
{
std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);
while (queue->wptr_queue_.empty() && !queue->thread_stop_)
queue->thread_cond_.wait(lock);
if (queue->thread_stop_)
break;
pendings.swap(queue->wptr_queue_);
}
for (const auto [start, end] : pendings) {
pr_debug("wptr %lx %lx\n", start, end);
SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(queue->cmdbuf_addr + queue->WrapIntoRocrRing(start));
SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1;
while (queue->IsPollPacket(poll_pkt)) {
uint64_t poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 |
(uint64_t)poll_pkt->ADDR_HI_UNION.addr_63_32 << 32;
uint64_t poll_val = poll_pkt->VALUE_UNION.value;
uint32_t skip = 1;
if (queue->IsPollPacket(poll_next_pkt)) {
uint64_t poll_next_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 |
(uint64_t)poll_next_pkt->ADDR_HI_UNION.addr_63_32 << 32;
if (poll_next_addr + sizeof(uint32_t) == poll_addr) {
poll_addr = poll_next_addr;
poll_val = poll_next_pkt->VALUE_UNION.value |
(uint64_t)poll_pkt->VALUE_UNION.value << 32;
skip = 2;
}
}
amd_signal_t* signal = (amd_signal_t*)((char*)poll_addr - offsetof(amd_signal_t, value));
uint64_t signal_handle = reinterpret_cast<uint64_t>(signal);
pr_debug("poll signal %#lx addr %#lx val %ld\n", signal_handle, poll_addr, poll_val);
hsa_signal_t hsa_signal = {signal_handle};
hsa_signal_value_t value =
hsakmt_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
assert(value == poll_val);
memset(poll_pkt, 0, skip * sizeof(*poll_pkt));
poll_pkt += skip;
poll_next_pkt += skip;
}
queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start);
std::atomic_thread_fence(std::memory_order_release);
queue->Submit();
}
}
pr_debug("sdma thread exit\n");
}
SDMAQueue::SDMAQueue(WDDMDevice *device,
void *ring,
uint64_t cmdbuf_size,
uint32_t engine,
bool use_hws) :
WDDMQueue(device, reinterpret_cast<uint64_t>(ring), cmdbuf_size, engine, use_hws),
wptr_next_(0),
wptr_pre_(0),
rptr_next(0),
thread_stop_(false),
ib_size(0),
ib_start_addr(0) {
bool ret = device->CreateQueue(this);
assert(ret);
thread_ = std::thread(SdmaThread, this);
}
SDMAQueue::~SDMAQueue() {
thread_cond_lock_.lock();
thread_stop_ = true;
thread_cond_lock_.unlock();
thread_cond_.notify_one();
thread_.join();
device->DestroyQueue(this);
}
void SDMAQueue::RingDoorbell() {
pr_debug("ringdoorbell %#lx %#lx\n", wptr_pre_, wptr_next_);
thread_cond_lock_.lock();
wptr_queue_.emplace_back(wptr_pre_, wptr_next_);
thread_cond_.notify_one();
thread_cond_lock_.unlock();
wptr_pre_ = wptr_next_;
}
hsa_status_t SDMAQueue::Init(void) {
hsa_status_t ret = use_hws ? HwsInit() : SwsInit();
if (ret)
return ret;
std::memset((char *)cmdbuf_addr, 0, cmdbuf_size);
return ret;
}
hsa_status_t SDMAQueue::Fini(void) {
return use_hws ? HwsFini() : SwsFini();
}
int SDMAQueue::PreparePacket(uint32_t offset, uint64_t size) {
ib_start_addr = cmdbuf_addr + offset;
ib_size = size;
rptr_next += ib_size;
return STATUS_SUCCESS;
}
hsa_status_t SDMAQueue::Submit(void) {
if (!device->WaitPagingFence(this))
return HSA_STATUS_ERROR;
int ret = use_hws ?
HwsSubmit(ib_start_addr, ib_size, rptr_next) :
SwsSubmit(ib_start_addr, ib_size, rptr_next);
if (ret)
return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;
}
} // namespace thunk
} // namespace wsl