wsl/hsakmt: add sdma queue implementation

Signed-off-by: Flora Cui <flora.cui@amd.com>
Reviewed-by: Shane Xiao <shane.xiao@amd.com>
Part-of: <http://10.67.69.192/wsl/libhsakmt/-/merge_requests/17>
Este commit está contenido en:
Flora Cui
2024-09-11 14:21:54 +08:00
cometido por Frank Min
padre e06e9b1d57
commit ab8771ccae
Se han modificado 4 ficheros con 218 adiciones y 29 borrados
+85 -11
Ver fichero
@@ -44,6 +44,9 @@
#include <cinttypes>
#include <condition_variable>
#include <iostream>
#include <queue>
#include <utility>
#include "inc/wddm/types.h"
#include "inc/wddm/device.h"
#include "inc/wddm/gpu_memory.h"
@@ -61,6 +64,7 @@ class WDDMDevice;
class WDDMQueue {
public:
WDDMQueue(WDDMDevice *device,
uint64_t cmdbuf_addr,
uint32_t cmdbuf_size,
uint32_t engine,
bool use_hws = true) :
@@ -70,7 +74,7 @@ public:
syncobj(NULL),
sync_addr(NULL),
cmdbuf(0),
cmdbuf_addr(0),
cmdbuf_addr(cmdbuf_addr),
cmdbuf_size(cmdbuf_size),
queue_engine(engine),
use_hws(use_hws),
@@ -80,8 +84,9 @@ public:
virtual ~WDDMQueue() { }
virtual hsa_status_t Init(void) = 0;
virtual hsa_status_t Fini(void) = 0;
virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; }
virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; }
virtual void RingDoorbell() { }
hsa_status_t SwsInit(void);
hsa_status_t SwsFini(void);
@@ -250,17 +255,12 @@ private:
class SDMAQueue : public WDDMQueue {
public:
SDMAQueue(WDDMDevice *device,
void *ring,
uint64_t cmdbuf_size,
uint32_t engine,
bool use_hws = true) :
WDDMQueue(device, cmdbuf_size, engine, use_hws),
rptr_next(0),
ib_size(0),
ib_start_addr(0) {
bool use_hws = true);
}
virtual ~SDMAQueue() { }
virtual ~SDMAQueue();
hsa_status_t Init(void);
hsa_status_t Fini(void);
@@ -272,10 +272,84 @@ public:
device->CpuWait(&syncobj, &rptr_next, 1, false);
}
// Address of the write-pointer word (wptr_next_). hsaKmtCreateQueue hands
// this pointer to the client as Queue_write_ptr_aql.
uint64_t * GetRingWptr() {
  return &wptr_next_;
}

// Address of the read-pointer word, which is whatever the base class
// reports as its sync address. Handed out as Queue_read_ptr_aql.
uint64_t * GetRingRptr() {
  return WDDMQueue::GetSyncAddr();
}

// Address of the doorbell word (doorbell_). Handed out as
// Queue_DoorBell_aql.
uint64_t * GetDoorbellPtr() {
  return &doorbell_;
}

// Queues the ring range written since the previous call and notifies the
// SDMA worker thread (defined out of line).
void RingDoorbell();
private:
// Write pointer word exposed through GetRingWptr(); RingDoorbell() reads it
// as the end of the newly written ring range.
uint64_t wptr_next_;
// Value wptr_next_ had at the previous RingDoorbell(); start of the next
// range to hand to the worker.
uint64_t wptr_pre_;
// Value passed to WDDMDevice::CpuWait() together with syncobj.
// NOTE(review): presumably the next fence value to wait for — confirm.
uint64_t rptr_next;
// Doorbell word exposed through GetDoorbellPtr().
uint64_t doorbell_;
// FIFO of (start, end) write-pointer pairs: pushed by RingDoorbell(),
// popped by SdmaThread().
std::queue<std::pair<uint64_t, uint64_t>> wptr_queue_;
// Zero-initialised by the constructor.
// NOTE(review): presumably describe the current indirect buffer — confirm.
uint64_t ib_size;
uint64_t ib_start_addr;
// Worker running SdmaThread(); started by the constructor and joined by the
// destructor.
std::thread thread_;
// Set to true (under thread_cond_lock_) by the destructor to ask the worker
// to exit.
bool thread_stop_;
// Mutex / condition variable pair the worker sleeps on while wptr_queue_ is
// empty; notified by RingDoorbell() and by the destructor.
std::mutex thread_cond_lock_;
std::condition_variable thread_cond_;
// Entry point of thread_; `queue` is the owning SDMAQueue.
static void SdmaThread(SDMAQueue *queue);
// In-ring layout of an SDMA POLL_REGMEM packet: six consecutive dwords, each
// reachable either through named bit-fields or as a raw DW_n_DATA word.
// NOTE(review): relies on 32-bit `unsigned int`, on the compiler packing
// bit-fields starting from the least significant bit, and on the anonymous
// struct extension — confirm for every supported toolchain.
struct SDMA_PKT_POLL_REGMEM {
  // DW0: packet header (the bit-field widths add up to 32).
  union {
    struct {
      unsigned int op : 8;          // opcode, see SDMA_OP_POLL_REGMEM
      unsigned int sub_op : 8;
      unsigned int reserved_0 : 10;
      unsigned int hdp_flush : 1;
      unsigned int reserved_1 : 1;
      unsigned int func : 3;        // compare function selector
      unsigned int mem_poll : 1;    // checked for 1 by IsPollPacket()
    };
    unsigned int DW_0_DATA;
  } HEADER_UNION;
  // DW1: low 32 bits of the polled address.
  union {
    struct {
      unsigned int addr_31_0 : 32;
    };
    unsigned int DW_1_DATA;
  } ADDR_LO_UNION;
  // DW2: high 32 bits of the polled address.
  union {
    struct {
      unsigned int addr_63_32 : 32;
    };
    unsigned int DW_2_DATA;
  } ADDR_HI_UNION;
  // DW3: 32-bit reference value.
  union {
    struct {
      unsigned int value : 32;
    };
    unsigned int DW_3_DATA;
  } VALUE_UNION;
  // DW4: 32-bit mask.
  union {
    struct {
      unsigned int mask : 32;
    };
    unsigned int DW_4_DATA;
  } MASK_UNION;
  // DW5: poll interval and retry count (16 + 12 + 4 reserved bits).
  union {
    struct {
      unsigned int interval : 16;
      unsigned int retry_count : 12;
      unsigned int reserved_0 : 4;
    };
    unsigned int DW_5_DATA;
  } DW5_UNION;
};

// Opcode value identifying a POLL_REGMEM packet in HEADER_UNION.op.
// A class-wide constant: `static constexpr` avoids storing a copy in every
// queue object.
static constexpr unsigned int SDMA_OP_POLL_REGMEM = 8;

// Returns true when `pkt` has the POLL_REGMEM opcode, the mem_poll bit set
// and func == 3.
// NOTE(review): func 3 is presumably the "equal" comparison — confirm against
// the SDMA packet specification.
// The packet is only inspected, never modified, and no object state is used,
// hence the pointer-to-const parameter and the `static` qualifier.
static bool IsPollPacket(const SDMA_PKT_POLL_REGMEM* pkt) {
  return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM &&
         pkt->HEADER_UNION.mem_poll == 1 &&
         pkt->HEADER_UNION.func == 3;
}
// Maps a monotonically growing ring index to an offset inside the ring by
// masking it with (cmdbuf_size - 1); the result is narrowed to uint32_t.
// NOTE(review): only a true modulo if cmdbuf_size is a power of two —
// confirm at the call sites that create the queue.
uint32_t WrapIntoRocrRing(uint64_t idx) {
  const auto ring_mask = cmdbuf_size - 1;
  return (idx & ring_mask);
}
};
} // namespace thunk
+17 -5
Ver fichero
@@ -63,15 +63,27 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(
bool use_hws = device_->IsHwsEnabled(queue_engine);
auto queue_ = new wsl::thunk::ComputeQueue(
device_, QueueAddress, pkg_num,
reinterpret_cast<std::atomic<uint64_t> *>(QueueResource->Queue_write_ptr_aql),
reinterpret_cast<std::atomic<uint64_t> *>(QueueResource->Queue_read_ptr_aql),
reinterpret_cast<std::atomic<uint64_t> *>(
QueueResource->Queue_write_ptr_aql),
reinterpret_cast<std::atomic<uint64_t> *>(
QueueResource->Queue_read_ptr_aql),
QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws);
QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
// for doorbell_signal.hardware_doorbell_ptr
QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
} break;
case HSA_QUEUE_SDMA:
case HSA_QUEUE_SDMA: {
uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO:
bool use_hws = device_->IsHwsEnabled(queue_engine);
auto queue_ = new wsl::thunk::SDMAQueue(
device_, QueueAddress, QueueSizeInBytes,
queue_engine, use_hws);
QueueResource->QueueId = reinterpret_cast<HSA_QUEUEID>(queue_);
QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr();
QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr();
QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr();
} break;
default:
assert(false);
QueueResource->QueueId = 0;
@@ -101,7 +113,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(
HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
CHECK_DXG_OPEN();
auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
if (!queue_)
return HSAKMT_STATUS_INVALID_PARAMETER;
@@ -165,7 +177,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS,
HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) {
CHECK_DXG_OPEN();
auto queue_ = reinterpret_cast<wsl::thunk::ComputeQueue *>(QueueId);
auto queue_ = reinterpret_cast<wsl::thunk::WDDMQueue *>(QueueId);
if (!queue_)
return HSAKMT_STATUS_INVALID_PARAMETER;
+11 -9
Ver fichero
@@ -739,17 +739,19 @@ bool WDDMDevice::CreateQueue(WDDMQueue *queue) {
if (!CreateContext(queue->queue_engine, &queue->context))
return false;
GpuMemoryCreateInfo create_info{};
create_info.size = queue->cmdbuf_size;
create_info.domain = thunk_proxy::kSystem;
GpuMemory *gpu_mem = nullptr;
auto code = CreateGpuMemory(create_info, &gpu_mem);
if (code != ErrorCode::Success)
goto err_out0;
if (queue->cmdbuf_addr == 0) {
GpuMemoryCreateInfo create_info{};
create_info.size = queue->cmdbuf_size;
create_info.domain = thunk_proxy::kSystem;
queue->cmdbuf = gpu_mem->GetGpuMemoryHandle();
queue->cmdbuf_addr = gpu_mem->GpuAddress();
auto code = CreateGpuMemory(create_info, &gpu_mem);
if (code != ErrorCode::Success)
goto err_out0;
queue->cmdbuf = gpu_mem->GetGpuMemoryHandle();
queue->cmdbuf_addr = gpu_mem->GpuAddress();
}
if (queue->Init())
goto err_out1;
+105 -4
Ver fichero
@@ -222,7 +222,7 @@ ComputeQueue::ComputeQueue(WDDMDevice *device,
uint32_t cmdbuf_size,
uint32_t engine,
bool use_hws) :
WDDMQueue(device, cmdbuf_size, engine, use_hws),
WDDMQueue(device, 0, cmdbuf_size, engine, use_hws),
ring(ring),
ring_size(ring_size),
ring_wptr(ring_wptr),
@@ -240,9 +240,8 @@ ComputeQueue::ComputeQueue(WDDMDevice *device,
scratch_size_per_wave_(0),
scratch_size_(0),
scratch_base_(nullptr) {
bool ret = device->CreateQueue(this);
assert(ret);
bool ret = device->CreateQueue(this);
assert(ret);
GpuMemoryCreateInfo create_info{};
create_info.size = PAGE_SIZE;
@@ -950,6 +949,108 @@ hsa_status_t ComputeQueue::Process(void) {
return HSA_STATUS_SUCCESS;
}
// Worker-thread entry point of an SDMAQueue.
//
// Repeatedly takes one (start, end) write-pointer range from
// queue->wptr_queue_, waits on the host for any leading pairs of POLL_REGMEM
// packets found at the start of that range, then calls PreparePacket() and
// Submit() for the range. When the queue is empty and more than kMaxElapsed
// has passed since the thread last woke up, it blocks on thread_cond_ until
// new work is queued or thread_stop_ is set.
//
// @param queue  The SDMAQueue that owns this thread; must outlive it.
void SDMAQueue::SdmaThread(SDMAQueue *queue) {
  // Busy-poll budget: once it is exceeded (measured from the last wake-up)
  // an empty queue sends the thread to sleep on the condition variable.
  const std::chrono::milliseconds kMaxElapsed(2000);

  bool sleep = false;
  std::chrono::steady_clock::time_point start_time =
      std::chrono::steady_clock::now();

  while (true) {
    bool have_range = false;
    uint64_t start = 0;
    uint64_t end = 0;
    {
      // wptr_queue_ is filled by RingDoorbell() from another thread and
      // std::queue is not thread-safe, so it is only touched while holding
      // thread_cond_lock_. The lock is dropped again before the potentially
      // unbounded signal wait below.
      std::lock_guard<std::mutex> guard(queue->thread_cond_lock_);
      if (!queue->wptr_queue_.empty()) {
        start = queue->wptr_queue_.front().first;
        end = queue->wptr_queue_.front().second;
        queue->wptr_queue_.pop();
        have_range = true;
      }
    }

    if (have_range) {
      debug_print("SDMA: wptr %lx %lx\n", start, end);

      SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(
          queue->cmdbuf_addr + queue->WrapIntoRocrRing(start));
      SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1;

      // Two consecutive poll packets are treated as the low and high 32-bit
      // halves of one 64-bit wait: the packet with the lower ADDR_LO supplies
      // the address and the low half of the value, the other one the high
      // half.
      // NOTE(review): this scan is bounded neither by `end` nor by the end of
      // the ring buffer — confirm that the producer guarantees a non-poll
      // packet follows inside the submitted range.
      while (queue->IsPollPacket(poll_pkt) && queue->IsPollPacket(poll_next_pkt)) {
        uint64_t poll_addr;
        uint64_t poll_val;
        if (poll_pkt->ADDR_LO_UNION.addr_31_0 > poll_next_pkt->ADDR_LO_UNION.addr_31_0) {
          poll_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 |
                      static_cast<uint64_t>(poll_next_pkt->ADDR_HI_UNION.addr_63_32) << 32;
          poll_val = poll_next_pkt->VALUE_UNION.value |
                     static_cast<uint64_t>(poll_pkt->VALUE_UNION.value) << 32;
        } else {
          poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 |
                      static_cast<uint64_t>(poll_pkt->ADDR_HI_UNION.addr_63_32) << 32;
          poll_val = poll_pkt->VALUE_UNION.value |
                     static_cast<uint64_t>(poll_next_pkt->VALUE_UNION.value) << 32;
        }

        // The polled address is the `value` member of an amd_signal_t; step
        // back to the start of the signal object to obtain its handle.
        amd_signal_t* signal = reinterpret_cast<amd_signal_t*>(
            reinterpret_cast<char*>(poll_addr) - offsetof(amd_signal_t, value));
        uint64_t signal_handle = reinterpret_cast<uint64_t>(signal);
        // poll_val is 64 bits wide, so it must not be printed with %d.
        debug_print("SDMA: poll signal %#lx addr %#lx val %lu\n", signal_handle, poll_addr, poll_val);

        hsa_signal_t hsa_signal = {signal_handle};
        hsa_signal_value_t value =
            fn_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
        assert(static_cast<uint64_t>(value) == poll_val);
        (void)value;  // only consumed by the assert; silences NDEBUG builds

        poll_pkt += 2;
        poll_next_pkt += 2;
      }

      queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start);
      std::atomic_thread_fence(std::memory_order_release);
      queue->Submit();
    } else {
      if (std::chrono::steady_clock::now() - start_time > kMaxElapsed)
        sleep = true;
    }

    std::unique_lock<std::mutex> lock(queue->thread_cond_lock_);

    // Honour a stop request as soon as nothing is pending, instead of
    // spinning until the idle timeout expires first.
    if (queue->thread_stop_ && queue->wptr_queue_.empty())
      break;

    if (sleep && queue->wptr_queue_.empty()) {
      queue->thread_cond_.wait(lock, [queue] {
        return queue->thread_stop_ || !queue->wptr_queue_.empty();
      });

      if (queue->thread_stop_)
        break;

      sleep = false;
      start_time = std::chrono::steady_clock::now();
    }
  }

  debug_print("sdma thread exit\n");
}
// Builds an SDMA queue on top of a caller-provided ring buffer.
//
// @param device       Device the queue is created on.
// @param ring         Ring buffer; its address is passed to the base class
//                     as cmdbuf_addr.
// @param cmdbuf_size  Size forwarded to the base class as cmdbuf_size.
// @param engine       Engine the queue is created on.
// @param use_hws      Forwarded to the base class; Init() uses it to choose
//                     between HwsInit() and SwsInit().
//
// Members are initialised in declaration order, and doorbell_ is zeroed
// because its address is handed out through GetDoorbellPtr().
SDMAQueue::SDMAQueue(WDDMDevice *device,
                     void *ring,
                     uint64_t cmdbuf_size,
                     uint32_t engine,
                     bool use_hws) :
  WDDMQueue(device, reinterpret_cast<uint64_t>(ring), cmdbuf_size, engine, use_hws),
  wptr_next_(0),
  wptr_pre_(0),
  rptr_next(0),
  doorbell_(0),
  ib_size(0),
  ib_start_addr(0),
  thread_stop_(false) {
  // NOTE(review): a CreateQueue() failure is only caught by the assert, so a
  // release build carries on with a queue that was not created.
  bool ret = device->CreateQueue(this);
  assert(ret);
  (void)ret;  // only consumed by the assert; silences NDEBUG builds

  // Start the worker last, once every member it reads has been initialised.
  thread_ = std::thread(SdmaThread, this);
}
// Stops the worker thread and tears the queue down: raise thread_stop_ under
// the worker's mutex, wake the worker, wait for it to finish, and only then
// destroy the device-side queue.
SDMAQueue::~SDMAQueue() {
  {
    std::lock_guard<std::mutex> stop_guard(thread_cond_lock_);
    thread_stop_ = true;
  }
  thread_cond_.notify_one();

  thread_.join();

  device->DestroyQueue(this);
}
void SDMAQueue::RingDoorbell() {
debug_print("SDMA: ringdoorbell %#llx %#llx\n", wptr_pre_, wptr_next_);
wptr_queue_.emplace(wptr_pre_, wptr_next_);
thread_cond_.notify_one();
wptr_pre_ = wptr_next_;
}
hsa_status_t SDMAQueue::Init(void) {
hsa_status_t ret = use_hws ? HwsInit() : SwsInit();
if (ret)