diff --git a/inc/wddm/queue.h b/inc/wddm/queue.h index ffddf3be60..6bf8c57dcc 100644 --- a/inc/wddm/queue.h +++ b/inc/wddm/queue.h @@ -44,6 +44,9 @@ #include #include +#include +#include +#include #include "inc/wddm/types.h" #include "inc/wddm/device.h" #include "inc/wddm/gpu_memory.h" @@ -61,6 +64,7 @@ class WDDMDevice; class WDDMQueue { public: WDDMQueue(WDDMDevice *device, + uint64_t cmdbuf_addr, uint32_t cmdbuf_size, uint32_t engine, bool use_hws = true) : @@ -70,7 +74,7 @@ public: syncobj(NULL), sync_addr(NULL), cmdbuf(0), - cmdbuf_addr(0), + cmdbuf_addr(cmdbuf_addr), cmdbuf_size(cmdbuf_size), queue_engine(engine), use_hws(use_hws), @@ -80,8 +84,9 @@ public: virtual ~WDDMQueue() { } - virtual hsa_status_t Init(void) = 0; - virtual hsa_status_t Fini(void) = 0; + virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; } + virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; } + virtual void RingDoorbell() { } hsa_status_t SwsInit(void); hsa_status_t SwsFini(void); @@ -250,17 +255,12 @@ private: class SDMAQueue : public WDDMQueue { public: SDMAQueue(WDDMDevice *device, + void *ring, uint64_t cmdbuf_size, uint32_t engine, - bool use_hws = true) : - WDDMQueue(device, cmdbuf_size, engine, use_hws), - rptr_next(0), - ib_size(0), - ib_start_addr(0) { + bool use_hws = true); - } - - virtual ~SDMAQueue() { } + virtual ~SDMAQueue(); hsa_status_t Init(void); hsa_status_t Fini(void); @@ -272,10 +272,84 @@ public: device->CpuWait(&syncobj, &rptr_next, 1, false); } + uint64_t * GetRingWptr(void) { return &wptr_next_; } + uint64_t * GetRingRptr(void) { return WDDMQueue::GetSyncAddr(); } + uint64_t * GetDoorbellPtr() { return &doorbell_; } + void RingDoorbell(); + private: + uint64_t wptr_next_; + uint64_t wptr_pre_; uint64_t rptr_next; + uint64_t doorbell_; + std::queue> wptr_queue_; uint64_t ib_size; uint64_t ib_start_addr; + + std::thread thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void SdmaThread(SDMAQueue *queue); + + struct SDMA_PKT_POLL_REGMEM { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 10; + unsigned int hdp_flush : 1; + unsigned int reserved_1 : 1; + unsigned int func : 3; + unsigned int mem_poll : 1; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int value : 32; + }; + unsigned int DW_3_DATA; + } VALUE_UNION; + + union { + struct { + unsigned int mask : 32; + }; + unsigned int DW_4_DATA; + } MASK_UNION; + + union { + struct { + unsigned int interval : 16; + unsigned int retry_count : 12; + unsigned int reserved_0 : 4; + }; + unsigned int DW_5_DATA; + } DW5_UNION; + }; + const unsigned int SDMA_OP_POLL_REGMEM = 8; + bool IsPollPacket(SDMA_PKT_POLL_REGMEM* pkt) { + return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM && + pkt->HEADER_UNION.mem_poll == 1 && + pkt->HEADER_UNION.func == 3; + } + uint32_t WrapIntoRocrRing(uint64_t idx) { return (idx & (cmdbuf_size - 1)); } }; } // namespace thunk diff --git a/queues.cpp b/queues.cpp index 0d94c833ac..ae8ee471b3 100644 --- a/queues.cpp +++ b/queues.cpp @@ -63,15 +63,27 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue( bool use_hws = device_->IsHwsEnabled(queue_engine); auto queue_ = new wsl::thunk::ComputeQueue( device_, QueueAddress, pkg_num, - reinterpret_cast *>(QueueResource->Queue_write_ptr_aql), - reinterpret_cast *>(QueueResource->Queue_read_ptr_aql), + reinterpret_cast *>( + QueueResource->Queue_write_ptr_aql), + reinterpret_cast *>( + QueueResource->Queue_read_ptr_aql), QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws); QueueResource->QueueId = reinterpret_cast(queue_); // for doorbell_signal.hardware_doorbell_ptr QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr(); } break; - case HSA_QUEUE_SDMA: + case HSA_QUEUE_SDMA: { + uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: + bool use_hws = device_->IsHwsEnabled(queue_engine); + auto queue_ = new wsl::thunk::SDMAQueue( + device_, QueueAddress, QueueSizeInBytes, + queue_engine, use_hws); + QueueResource->QueueId = reinterpret_cast(queue_); + QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr(); + QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr(); + QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr(); + } break; default: assert(false); QueueResource->QueueId = 0; @@ -101,7 +113,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue( HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) { CHECK_DXG_OPEN(); - auto queue_ = reinterpret_cast(QueueId); + auto queue_ = reinterpret_cast(QueueId); if (!queue_) return HSAKMT_STATUS_INVALID_PARAMETER; @@ -165,7 +177,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS, HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) { CHECK_DXG_OPEN(); - auto queue_ = reinterpret_cast(QueueId); + auto queue_ = reinterpret_cast(QueueId); if (!queue_) return HSAKMT_STATUS_INVALID_PARAMETER; diff --git a/wddm/device.cpp b/wddm/device.cpp index cb9dd3b48e..8dfff4032a 100644 --- a/wddm/device.cpp +++ b/wddm/device.cpp @@ -739,17 +739,19 @@ bool WDDMDevice::CreateQueue(WDDMQueue *queue) { if (!CreateContext(queue->queue_engine, &queue->context)) return false; - GpuMemoryCreateInfo create_info{}; - create_info.size = queue->cmdbuf_size; - create_info.domain = thunk_proxy::kSystem; - GpuMemory *gpu_mem = nullptr; - auto code = CreateGpuMemory(create_info, &gpu_mem); - if (code != ErrorCode::Success) - goto err_out0; + if (queue->cmdbuf_addr == 0) { + GpuMemoryCreateInfo create_info{}; + create_info.size = queue->cmdbuf_size; + create_info.domain = thunk_proxy::kSystem; - queue->cmdbuf = gpu_mem->GetGpuMemoryHandle(); - queue->cmdbuf_addr = gpu_mem->GpuAddress(); + auto code = CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) + goto err_out0; + + queue->cmdbuf = gpu_mem->GetGpuMemoryHandle(); + queue->cmdbuf_addr = gpu_mem->GpuAddress(); + } if (queue->Init()) goto err_out1; diff --git a/wddm/queue.cpp b/wddm/queue.cpp index 1c3ccfae5c..28fcd40ffc 100644 --- a/wddm/queue.cpp +++ b/wddm/queue.cpp @@ -222,7 +222,7 @@ ComputeQueue::ComputeQueue(WDDMDevice *device, uint32_t cmdbuf_size, uint32_t engine, bool use_hws) : - WDDMQueue(device, cmdbuf_size, engine, use_hws), + WDDMQueue(device, 0, cmdbuf_size, engine, use_hws), ring(ring), ring_size(ring_size), ring_wptr(ring_wptr), @@ -240,9 +240,8 @@ ComputeQueue::ComputeQueue(WDDMDevice *device, scratch_size_per_wave_(0), scratch_size_(0), scratch_base_(nullptr) { - - bool ret = device->CreateQueue(this); - assert(ret); + bool ret = device->CreateQueue(this); + assert(ret); GpuMemoryCreateInfo create_info{}; create_info.size = PAGE_SIZE; @@ -950,6 +949,108 @@ hsa_status_t ComputeQueue::Process(void) { return HSA_STATUS_SUCCESS; } +void SDMAQueue::SdmaThread(SDMAQueue *queue) { + // This timing system is used for sleeping this Thread + // when one packet is invalid for about 2 seconds. + std::chrono::steady_clock::time_point start_time, time; + // Set the polling timeout value for 2 seconds + const std::chrono::milliseconds kMaxElapsed(2000); + bool sleep = false; + start_time = std::chrono::steady_clock::now(); + + while (true) { + if (!queue->wptr_queue_.empty()) { + uint64_t start = queue->wptr_queue_.front().first; + uint64_t end = queue->wptr_queue_.front().second; + queue->wptr_queue_.pop(); + debug_print("SDMA: wptr %lx %lx\n", start, end); + + SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast(queue->cmdbuf_addr + queue->WrapIntoRocrRing(start)); + SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1; + while (queue->IsPollPacket(poll_pkt) && queue->IsPollPacket(poll_next_pkt)) { + uint64_t poll_addr; + uint64_t poll_val; + if (poll_pkt->ADDR_LO_UNION.addr_31_0 > poll_next_pkt->ADDR_LO_UNION.addr_31_0) { + poll_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 | + (uint64_t)poll_next_pkt->ADDR_HI_UNION.addr_63_32 << 32; + poll_val = poll_next_pkt->VALUE_UNION.value | + (uint64_t)poll_pkt->VALUE_UNION.value << 32; + } else { + poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 | + (uint64_t)poll_pkt->ADDR_HI_UNION.addr_63_32 << 32; + poll_val = poll_pkt->VALUE_UNION.value | + (uint64_t)poll_next_pkt->VALUE_UNION.value << 32; + } + amd_signal_t* signal = (amd_signal_t*)((char*)poll_addr - offsetof(amd_signal_t, value)); + uint64_t signal_handle = reinterpret_cast(signal); + debug_print("SDMA: poll signal %#lx addr %#lx val %d\n", signal_handle, poll_addr, poll_val); + hsa_signal_t hsa_signal = {signal_handle}; + hsa_signal_value_t value = + fn_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + assert(value == poll_val); + + poll_pkt += 2; + poll_next_pkt += 2; + } + queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start); + std::atomic_thread_fence(std::memory_order_release); + queue->Submit(); + } else { + time = std::chrono::steady_clock::now(); + if (time - start_time > kMaxElapsed) + sleep = true; + } + + std::unique_lock lock(queue->thread_cond_lock_); + if (sleep && queue->wptr_queue_.empty()) { + while (!queue->thread_stop_ && queue->wptr_queue_.empty()) { + queue->thread_cond_.wait(lock); + } + if (queue->thread_stop_) + break; + sleep = false; + start_time = std::chrono::steady_clock::now(); + } + } + debug_print("sdma thread exit\n"); +} + +SDMAQueue::SDMAQueue(WDDMDevice *device, + void *ring, + uint64_t cmdbuf_size, + uint32_t engine, + bool use_hws) : + WDDMQueue(device, reinterpret_cast(ring), cmdbuf_size, engine, use_hws), + wptr_next_(0), + wptr_pre_(0), + rptr_next(0), + thread_stop_(false), + ib_size(0), + ib_start_addr(0) { + bool ret = device->CreateQueue(this); + assert(ret); + + thread_ = std::thread(SdmaThread, this); +} + +SDMAQueue::~SDMAQueue() { + thread_cond_lock_.lock(); + thread_stop_ = true; + thread_cond_lock_.unlock(); + thread_cond_.notify_one(); + thread_.join(); + + device->DestroyQueue(this); +} + +void SDMAQueue::RingDoorbell() { + debug_print("SDMA: ringdoorbell %#llx %#llx\n", wptr_pre_, wptr_next_); + + wptr_queue_.emplace(wptr_pre_, wptr_next_); + thread_cond_.notify_one(); + wptr_pre_ = wptr_next_; +} + hsa_status_t SDMAQueue::Init(void) { hsa_status_t ret = use_hws ? HwsInit() : SwsInit(); if (ret)