rocr/aie: AIE Queue Processing
Change-Id: I681c971ba7229037ca85d5529838aa7bbe5820e2
[ROCm/ROCR-Runtime commit: e9cc839b2b]
Этот коммит содержится в:
коммит произвёл
David Yat Sin
родитель
a317650289
Коммит
8ea388af92
+252
-23
@@ -117,7 +117,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
aie_agent.SetNumCols(aie_metadata.cols);
|
||||
// Right now can only target N-1 columns as that is the
|
||||
// number of shim DMAs in npu1 devices.
|
||||
aie_agent.SetNumCols(aie_metadata.cols - 1);
|
||||
aie_agent.SetNumCoreRows(aie_metadata.core.row_count);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -147,7 +149,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
|
||||
}
|
||||
|
||||
if (m_region.kernarg()) {
|
||||
create_bo_args.type = AMDXDNA_BO_CMD;
|
||||
create_bo_args.type = AMDXDNA_BO_SHMEM;
|
||||
} else {
|
||||
create_bo_args.type = AMDXDNA_BO_DEV;
|
||||
}
|
||||
@@ -189,11 +191,26 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
|
||||
}
|
||||
|
||||
vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem);
|
||||
vmem_addr_mappings.emplace(mapped_mem, create_bo_args.handle);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/// @brief Releases the buffer object that backs a tracked allocation.
///
/// The address is resolved to its BO handle through vmem_addr_mappings, the
/// handle is closed with DRM_IOCTL_GEM_CLOSE and, only once the close has
/// succeeded, both bookkeeping entries for the allocation are dropped.
///
/// @param mem Address used as the lookup key in vmem_addr_mappings.
/// @param size Not consulted by this implementation.
/// @return HSA_STATUS_ERROR_INVALID_ALLOCATION if @p mem is not tracked,
///         HSA_STATUS_ERROR if the close ioctl fails, HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
  const auto mapping = vmem_addr_mappings.find(mem);
  if (mapping == vmem_addr_mappings.end()) {
    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
  }

  // Copy the handle out before the map entry is erased below.
  const auto bo_handle = mapping->second;

  drm_gem_close gem_close = {};
  gem_close.handle = bo_handle;
  const int rc = ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &gem_close);
  if (rc < 0) return HSA_STATUS_ERROR;

  // The kernel object is gone; forget it in both lookup directions.
  vmem_addr_mappings.erase(mapping);
  vmem_handle_mappings.erase(bo_handle);
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
@@ -207,18 +224,10 @@ hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const {
|
||||
|
||||
// Currently we do not leverage QoS information.
|
||||
amdxdna_qos_info qos_info{0};
|
||||
amdxdna_drm_create_hwctx create_hwctx_args{
|
||||
.ext = 0,
|
||||
.ext_flags = 0,
|
||||
.qos_p = reinterpret_cast<uintptr_t>(&qos_info),
|
||||
.umq_bo = 0,
|
||||
.log_buf_bo = 0,
|
||||
// TODO: Make this configurable.
|
||||
.max_opc = 0x800,
|
||||
// This field is for the number of core tiles.
|
||||
.num_tiles = aie_agent.GetNumCores(),
|
||||
.mem_size = 0,
|
||||
.umq_doorbell = 0};
|
||||
amdxdna_drm_create_hwctx create_hwctx_args = {};
|
||||
create_hwctx_args.qos_p = reinterpret_cast<uintptr_t>(&qos_info);
|
||||
create_hwctx_args.max_opc = 0x800;
|
||||
create_hwctx_args.num_tiles = static_cast<uint32_t>(aie_agent.GetNumCores());
|
||||
|
||||
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
@@ -261,13 +270,9 @@ hsa_status_t XdnaDriver::QueryDriverVersion() {
|
||||
}
|
||||
|
||||
hsa_status_t XdnaDriver::InitDeviceHeap() {
|
||||
amdxdna_drm_create_bo create_bo_args{
|
||||
.flags = 0,
|
||||
.type = AMDXDNA_BO_DEV_HEAP,
|
||||
._pad = 0,
|
||||
.vaddr = reinterpret_cast<uintptr_t>(nullptr),
|
||||
.size = dev_heap_size,
|
||||
.handle = 0};
|
||||
amdxdna_drm_create_bo create_bo_args = {};
|
||||
create_bo_args.size = dev_heap_size;
|
||||
create_bo_args.type = AMDXDNA_BO_DEV_HEAP;
|
||||
|
||||
amdxdna_drm_get_bo_info get_bo_info_args{0};
|
||||
drm_gem_close close_bo_args{0};
|
||||
@@ -316,19 +321,243 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::unordered_map<uint32_t, void*>& XdnaDriver::GetHandleMappings() {
|
||||
return vmem_handle_mappings;
|
||||
}
|
||||
|
||||
/// @brief Gives mutable access to the table that maps a mapped address back
/// to its BO handle.
auto XdnaDriver::GetAddrMappings() -> std::unordered_map<void*, uint32_t>& {
  return vmem_addr_mappings;
}
|
||||
|
||||
/// @brief Unmaps the device heap mappings held by the driver.
///
/// Each mapping is unmapped exactly once and its pointer is reset only after
/// munmap reports success, so a failed call leaves the pointer in place.
///
/// @return HSA_STATUS_ERROR as soon as one munmap fails, HSA_STATUS_SUCCESS
///         when every present mapping was released (or none was present).
hsa_status_t XdnaDriver::FreeDeviceHeap() {
  if (dev_heap_parent) {
    // Length mirrors the expression used for this mapping in the original code.
    if (munmap(dev_heap_parent, dev_heap_align * 2 - 1) != 0) return HSA_STATUS_ERROR;
    dev_heap_parent = nullptr;
  }

  if (dev_heap_aligned) {
    if (munmap(dev_heap_aligned, dev_heap_size) != 0) return HSA_STATUS_ERROR;
    dev_heap_aligned = nullptr;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// @brief Flushes the CPU cache for a list of buffers.
///
/// @param bo_addrs Start address of each buffer.
/// @param bo_sizes Size of each buffer; must have the same length as @p bo_addrs.
/// @return HSA_STATUS_ERROR if the two lists differ in length,
///         HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::SyncBos(const std::vector<uint64_t>& bo_addrs,
                                 const std::vector<uint32_t>& bo_sizes) {
  if (bo_addrs.size() != bo_sizes.size()) return HSA_STATUS_ERROR;

  // size_t index: avoids the signed/unsigned comparison against size().
  for (size_t i = 0; i < bo_addrs.size(); ++i) {
    FlushCpuCache(reinterpret_cast<void*>(bo_addrs[i]), 0, bo_sizes[i]);
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// @brief Submits a command to the driver and blocks until it has completed.
///
/// @param exec_cmd Submission arguments handed to DRM_IOCTL_AMDXDNA_EXEC_CMD;
///        its seq field is read back afterwards to identify the command to wait on.
/// @param hw_ctx_handle Hardware context the wait is issued against.
/// @return HSA_STATUS_ERROR if either ioctl returns non-zero,
///         HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::ExecCmdAndWait(amdxdna_drm_exec_cmd* exec_cmd, uint32_t hw_ctx_handle) {
  const int submit_rc = ioctl(fd_, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd);
  if (submit_rc != 0) {
    return HSA_STATUS_ERROR;
  }

  // seq is only meaningful after the submission ioctl has returned.
  amdxdna_drm_wait_cmd wait_args = {};
  wait_args.seq = exec_cmd->seq;
  wait_args.hwctx = hw_ctx_handle;
  wait_args.timeout = DEFAULT_TIMEOUT_VAL;

  const int wait_rc = ioctl(fd_, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_args);
  return wait_rc == 0 ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}
|
||||
|
||||
/// @brief Collects the buffer objects referenced by one command payload and
/// rewrites the instruction sequence address into a device address.
///
/// The instruction sequence BO is appended first, followed by every operand
/// in payload order; @p bo_args, @p bo_addrs and @p bo_sizes therefore stay
/// index-aligned for this command.
///
/// @param count Packet count field, used to derive the number of operands.
/// @param bo_args Receives the BO handle of each referenced buffer.
/// @param bo_sizes Receives the size of each referenced buffer.
/// @param bo_addrs Receives the address of each referenced buffer.
/// @param cmd_pkt_payload Payload to inspect; its instruction sequence word is
///        overwritten with the device address.
/// @param vmem_addr_mappings Lookup table from address to BO handle.
/// @return HSA_STATUS_ERROR if any referenced address has no known handle
///         (the output vectors may already hold entries for earlier buffers),
///         HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::RegisterCmdBOs(
    uint32_t count, std::vector<uint32_t>& bo_args, std::vector<uint32_t>& bo_sizes,
    std::vector<uint64_t>& bo_addrs, hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload,
    const std::unordered_map<void*, uint32_t>& vmem_addr_mappings) {
  // Payload index at which the operand addresses start.
  constexpr uint32_t operand_starting_index = 5;

  // Number of operands carried by the command payload.
  const uint32_t num_operands = GetOperandCount(count);

  // The instruction sequence address is stored as two 32-bit words (low, high).
  const uint64_t instr_addr = Concat<uint64_t, uint32_t>(
      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX + 1],
      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]);
  const auto instr_handle = vmem_addr_mappings.find(reinterpret_cast<void*>(instr_addr));
  if (instr_handle == vmem_addr_mappings.end()) return HSA_STATUS_ERROR;

  // Keep track of the handle and address before the packet is submitted.
  bo_args.push_back(instr_handle->second);
  bo_addrs.push_back(instr_addr);

  // The packet stores the number of instructions; convert it to bytes.
  const uint32_t instr_bo_size =
      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX] * INSTR_SIZE_BYTES;
  bo_sizes.push_back(instr_bo_size);

  // Each operand address occupies two consecutive 32-bit payload words, so
  // step through the payload two words at a time and resolve every address
  // to its BO handle.
  for (uint32_t operand_iter = 0; operand_iter < num_operands; ++operand_iter) {
    const uint32_t operand_index = operand_starting_index + 2 * operand_iter;
    const uint64_t operand_addr = Concat<uint64_t, uint32_t>(
        cmd_pkt_payload->data[operand_index + 1], cmd_pkt_payload->data[operand_index]);
    const auto operand_handle = vmem_addr_mappings.find(reinterpret_cast<void*>(operand_addr));
    if (operand_handle == vmem_addr_mappings.end()) return HSA_STATUS_ERROR;
    bo_args.push_back(operand_handle->second);
    bo_addrs.push_back(operand_addr);
  }

  // The operand sizes follow the operand addresses, one word per operand.
  // They are recorded so the buffers can be synced later.
  const uint32_t operand_size_starting_index = operand_starting_index + 2 * num_operands;
  for (uint32_t operand_iter = 0; operand_iter < num_operands; ++operand_iter) {
    bo_sizes.push_back(cmd_pkt_payload->data[operand_size_starting_index + operand_iter]);
  }

  // Transform the instruction sequence address into a device address. The
  // parentheses spell out the grouping that operator precedence already
  // applied, and the cast makes the narrowing to a payload word explicit.
  cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] =
      static_cast<uint32_t>(DEV_ADDR_BASE | (instr_addr & DEV_ADDR_OFFSET_MASK));

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// @brief Creates a command buffer object and maps it into the process.
///
/// @param size Requested size of the command BO in bytes.
/// @param handle Receives the BO handle on success; untouched on failure.
/// @param cmd Receives the mapped command buffer on success; untouched on failure.
/// @return HSA_STATUS_ERROR if the BO cannot be created, queried or mapped
///         (a BO that was already created is closed again),
///         HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::CreateCmd(uint32_t size, uint32_t* handle, amdxdna_cmd** cmd) {
  // Creating the command
  amdxdna_drm_create_bo create_cmd_bo = {};
  create_cmd_bo.type = AMDXDNA_BO_CMD;
  create_cmd_bo.size = size;
  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) return HSA_STATUS_ERROR;

  // From here on the BO exists, so every failure path must close it again.
  auto close_cmd_bo = [&]() {
    drm_gem_close close_args = {};
    close_args.handle = create_cmd_bo.handle;
    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_args);
  };

  amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {};
  cmd_bo_get_bo_info.handle = create_cmd_bo.handle;
  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info)) {
    close_cmd_bo();
    return HSA_STATUS_ERROR;
  }

  // Check the value returned by mmap itself: the out-parameter `cmd` is a
  // valid pointer and can never compare equal to MAP_FAILED.
  void* mapping = mmap(nullptr, create_cmd_bo.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                       cmd_bo_get_bo_info.map_offset);
  if (mapping == MAP_FAILED) {
    close_cmd_bo();
    return HSA_STATUS_ERROR;
  }

  *cmd = static_cast<amdxdna_cmd*>(mapping);
  *handle = create_cmd_bo.handle;

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// @brief Submits a run of contiguous packets to the driver as one command
/// chain and waits for it to finish.
///
/// For every packet a command BO is created and filled from the packet
/// payload; the handles of those BOs are then listed in a command-chain BO,
/// which is what actually gets executed. All command BOs are unmapped and
/// closed before returning, on success and on every failure path.
///
/// @param first_pkt First packet of the contiguous run.
/// @param num_pkts Number of packets in the run.
/// @param num_operands Used only as a capacity hint for the bookkeeping vectors.
/// @param hw_ctx_handle Hardware context to execute on.
/// @return HSA_STATUS_SUCCESS if the chain executed and all resources were
///         released; HSA_STATUS_ERROR otherwise.
hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
                                        uint32_t num_operands, uint32_t hw_ctx_handle) {
  // Metadata of the BOs that hold the operands of the commands being submitted.
  std::vector<uint32_t> bo_args;
  std::vector<uint32_t> bo_sizes;
  std::vector<uint64_t> bo_addrs;
  bo_args.reserve(num_operands);
  bo_sizes.reserve(num_operands);
  bo_addrs.reserve(num_operands);

  // The per-packet command BOs and their metadata.
  std::vector<uint32_t> cmd_handles;
  std::vector<uint32_t> cmd_sizes;
  std::vector<amdxdna_cmd*> cmds;
  cmd_handles.reserve(num_pkts);
  cmd_sizes.reserve(num_pkts);
  cmds.reserve(num_pkts);

  // Unmaps and closes every per-packet command BO created so far. All BOs are
  // processed even if one munmap fails; the result says whether all succeeded.
  auto release_cmd_bos = [&]() -> bool {
    bool all_unmapped = true;
    drm_gem_close close_bo_args = {};
    for (size_t i = 0; i < cmd_handles.size(); ++i) {
      if (munmap(cmds[i], cmd_sizes[i]) != 0) all_unmapped = false;
      close_bo_args.handle = cmd_handles[i];
      ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
    }
    cmd_handles.clear();
    cmd_sizes.clear();
    cmds.clear();
    return all_unmapped;
  };

  // Iterating over all the contiguous HSA_AMD_AIE_ERT_CMD_CHAIN packets
  for (uint32_t pkt_iter = 0; pkt_iter < num_pkts; ++pkt_iter) {
    // Getting the current command packet
    hsa_amd_aie_ert_packet_t* pkt = first_pkt + pkt_iter;
    hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload =
        reinterpret_cast<hsa_amd_aie_ert_start_kernel_data_t*>(pkt->payload_data);

    // Record the handles of all BOs the command references and rewrite the
    // instruction sequence address in the payload.
    if (RegisterCmdBOs(pkt->count, bo_args, bo_sizes, bo_addrs, cmd_pkt_payload,
                       vmem_addr_mappings) != HSA_STATUS_SUCCESS) {
      release_cmd_bos();
      return HSA_STATUS_ERROR;
    }

    // Creating a BO that contains the command to execute the kernel.
    // NOTE(review): the fill below writes data[0] plus pkt->count words, one
    // word more than cmd_size accounts for - confirm the BO is large enough.
    uint32_t cmd_bo_handle = 0;
    amdxdna_cmd* cmd = nullptr;
    const uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
    if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd)) {
      release_cmd_bos();
      return HSA_STATUS_ERROR;
    }

    // Track the BO right away so that every later exit path releases it.
    cmd_handles.push_back(cmd_bo_handle);
    cmds.push_back(cmd);
    cmd_sizes.push_back(cmd_size);

    // Filling in the fields of the command
    cmd->state = pkt->state;
    cmd->extra_cu_masks = 0;

    // The driver places a structure before each command in a command chain.
    // Need to increase the size of the command by the size of this structure.
    cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
    cmd->opcode = pkt->opcode;
    cmd->data[0] = cmd_pkt_payload->cu_mask;
    memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count);
  }

  // Creating a BO that contains the command chain.
  // NOTE(review): the requested size is one 32-bit word per handle plus one,
  // while the writes below cover the header, an amdxdna_cmd_chain and one
  // 64-bit entry per handle - confirm the BO is large enough.
  uint32_t cmd_chain_bo_handle = 0;
  amdxdna_cmd* cmd_chain = nullptr;
  int cmd_chain_size = (cmd_handles.size() + 1) * sizeof(uint32_t);
  if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain)) {
    release_cmd_bos();
    return HSA_STATUS_ERROR;
  }

  // Unmaps and closes the command-chain BO; reports whether munmap succeeded.
  auto release_cmd_chain = [&]() -> bool {
    const bool unmapped = munmap(cmd_chain, cmd_chain_size) == 0;
    drm_gem_close close_bo_args = {};
    close_bo_args.handle = cmd_chain_bo_handle;
    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
    return unmapped;
  };

  // Writing information to the command buffer
  amdxdna_cmd_chain* cmd_chain_payload = reinterpret_cast<amdxdna_cmd_chain*>(cmd_chain->data);

  // Creating a command chain
  cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
  cmd_chain->extra_cu_masks = 0;
  cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
  cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
  cmd_chain_payload->command_count = cmd_handles.size();
  cmd_chain_payload->submit_index = 0;
  cmd_chain_payload->error_index = 0;
  for (size_t i = 0; i < cmd_handles.size(); ++i) {
    cmd_chain_payload->data[i] = cmd_handles[i];
  }

  // Syncing BOs before we execute the command
  if (SyncBos(bo_addrs, bo_sizes)) {
    release_cmd_bos();
    release_cmd_chain();
    return HSA_STATUS_ERROR;
  }

  // Removing duplicates in the bo container. The driver will report
  // an error if we provide the same BO handle multiple times.
  // This can happen if any of the BOs are the same across jobs
  std::sort(bo_args.begin(), bo_args.end());
  bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end());

  // Filling in the fields to execute the command chain
  amdxdna_drm_exec_cmd exec_cmd_0 = {};
  exec_cmd_0.hwctx = hw_ctx_handle;
  exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
  exec_cmd_0.cmd_handles = cmd_chain_bo_handle;
  exec_cmd_0.args = reinterpret_cast<uint64_t>(bo_args.data());
  exec_cmd_0.cmd_count = 1;
  exec_cmd_0.arg_count = bo_args.size();

  // Executing all commands in the command chain. The status is kept so that a
  // failed submission or wait is reported once cleanup is done.
  const hsa_status_t exec_status = ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle);

  // Unmapping and closing the cmd BOs, then the cmd_chain BO
  const bool cmds_released = release_cmd_bos();
  const bool chain_released = release_cmd_chain();
  if (!cmds_released || !chain_released) return HSA_STATUS_ERROR;
  if (exec_status != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR;

  // Syncing BOs after we execute the command
  if (SyncBos(bo_addrs, bo_sizes)) return HSA_STATUS_ERROR;

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
} // namespace AMD
|
||||
} // namespace rocr
|
||||
|
||||
+55
-90
@@ -21,7 +21,6 @@ extern "C" {
|
||||
#define AMDXDNA_DRIVER_MAJOR 1
|
||||
#define AMDXDNA_DRIVER_MINOR 0
|
||||
|
||||
#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
|
||||
#define AMDXDNA_INVALID_ADDR (~0UL)
|
||||
#define AMDXDNA_INVALID_CTX_HANDLE 0
|
||||
#define AMDXDNA_INVALID_BO_HANDLE 0
|
||||
@@ -50,11 +49,9 @@ enum amdxdna_drm_ioctl_id {
|
||||
DRM_AMDXDNA_GET_BO_INFO,
|
||||
DRM_AMDXDNA_SYNC_BO,
|
||||
DRM_AMDXDNA_EXEC_CMD,
|
||||
DRM_AMDXDNA_WAIT_CMD,
|
||||
DRM_AMDXDNA_GET_INFO,
|
||||
DRM_AMDXDNA_SET_STATE,
|
||||
DRM_AMDXDNA_SUBMIT_WAIT,
|
||||
DRM_AMDXDNA_SUBMIT_SIGNAL,
|
||||
DRM_AMDXDNA_WAIT_CMD,
|
||||
DRM_AMDXDNA_NUM_IOCTLS
|
||||
};
|
||||
|
||||
@@ -96,6 +93,7 @@ struct amdxdna_qos_info {
|
||||
* @mem_size: Size of AIE tile memory.
|
||||
* @umq_doorbell: Returned offset of doorbell associated with UMQ.
|
||||
* @handle: Returned hardware context handle.
|
||||
* @pad: Structure padding.
|
||||
*/
|
||||
struct amdxdna_drm_create_hwctx {
|
||||
__u64 ext;
|
||||
@@ -108,12 +106,13 @@ struct amdxdna_drm_create_hwctx {
|
||||
__u32 mem_size;
|
||||
__u32 umq_doorbell;
|
||||
__u32 handle;
|
||||
__u32 pad;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_destroy_hwctx - Destroy hardware context.
|
||||
* @handle: Hardware context handle.
|
||||
* @pad: MBZ.
|
||||
* @pad: Structure padding.
|
||||
*/
|
||||
struct amdxdna_drm_destroy_hwctx {
|
||||
__u32 handle;
|
||||
@@ -122,9 +121,9 @@ struct amdxdna_drm_destroy_hwctx {
|
||||
|
||||
/**
|
||||
* struct amdxdna_cu_config - configuration for one CU
|
||||
* @cu_bo: CU configuration buffer bo handle
|
||||
* @cu_func: Functional of a CU
|
||||
* @pad: MBZ
|
||||
* @cu_bo: CU configuration buffer bo handle.
|
||||
* @cu_func: Function of a CU.
|
||||
* @pad: Structure padding.
|
||||
*/
|
||||
struct amdxdna_cu_config {
|
||||
__u32 cu_bo;
|
||||
@@ -135,9 +134,9 @@ struct amdxdna_cu_config {
|
||||
/**
|
||||
* struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware
|
||||
* context
|
||||
* @num_cus: Number of CUs to configure
|
||||
* @pad: MBZ
|
||||
* @cu_configs: Array of CU configurations of struct amdxdna_cu_config
|
||||
* @num_cus: Number of CUs to configure.
|
||||
* @pad: Structure padding.
|
||||
* @cu_configs: Array of CU configurations of struct amdxdna_cu_config.
|
||||
*/
|
||||
struct amdxdna_hwctx_param_config_cu {
|
||||
__u16 num_cus;
|
||||
@@ -160,6 +159,7 @@ enum amdxdna_drm_config_hwctx_param {
|
||||
* @param_val: A structure specified by the param_type struct member.
|
||||
* @param_val_size: Size of the parameter buffer pointed to by the param_val.
|
||||
* If param_val is not a pointer, driver can ignore this.
|
||||
* @pad: Structure padding.
|
||||
*
|
||||
* Note: if the param_val is a pointer pointing to a buffer, the maximum size
|
||||
* of the buffer is 4KiB(PAGE_SIZE).
|
||||
@@ -191,17 +191,16 @@ enum amdxdna_bo_type {
|
||||
/**
|
||||
* struct amdxdna_drm_create_bo - Create a buffer object.
|
||||
* @flags: Buffer flags. MBZ.
|
||||
* @type: Buffer type.
|
||||
* @vaddr: User VA of buffer if applied. MBZ.
|
||||
* @size: Size in bytes.
|
||||
* @type: Buffer type.
|
||||
* @handle: Returned DRM buffer object handle.
|
||||
*/
|
||||
struct amdxdna_drm_create_bo {
|
||||
__u64 flags;
|
||||
__u32 type;
|
||||
__u32 _pad;
|
||||
__u64 vaddr;
|
||||
__u64 size;
|
||||
__u32 type;
|
||||
__u32 handle;
|
||||
};
|
||||
|
||||
@@ -210,6 +209,7 @@ struct amdxdna_drm_create_bo {
|
||||
* @ext: MBZ.
|
||||
* @ext_flags: MBZ.
|
||||
* @handle: DRM buffer object handle.
|
||||
* @pad: Structure padding.
|
||||
* @map_offset: Returned DRM fake offset for mmap().
|
||||
* @vaddr: Returned user VA of buffer. 0 in case user needs mmap().
|
||||
* @xdna_addr: Returned XDNA device virtual address.
|
||||
@@ -218,7 +218,7 @@ struct amdxdna_drm_get_bo_info {
|
||||
__u64 ext;
|
||||
__u64 ext_flags;
|
||||
__u32 handle;
|
||||
__u32 _pad;
|
||||
__u32 pad;
|
||||
__u64 map_offset;
|
||||
__u64 vaddr;
|
||||
__u64 xdna_addr;
|
||||
@@ -252,8 +252,8 @@ enum amdxdna_cmd_type {
|
||||
* @ext_flags: MBZ.
|
||||
* @hwctx: Hardware context handle.
|
||||
* @type: One of command type in enum amdxdna_cmd_type.
|
||||
* @cmd_handles: Array of command handles or the command handle itself in case
|
||||
* of just one.
|
||||
* @cmd_handles: Array of command handles or the command handle itself
|
||||
* in case of just one.
|
||||
* @args: Array of arguments for all command handles.
|
||||
* @cmd_count: Number of command handles in the cmd_handles array.
|
||||
* @arg_count: Number of arguments in the args array.
|
||||
@@ -279,8 +279,6 @@ struct amdxdna_drm_exec_cmd {
|
||||
* @seq: sequence number of the command returned by execute command.
|
||||
*
|
||||
* Wait a command specified by seq to be completed.
|
||||
* Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
|
||||
* to submit a new command.
|
||||
*/
|
||||
struct amdxdna_drm_wait_cmd {
|
||||
__u32 hwctx;
|
||||
@@ -290,10 +288,9 @@ struct amdxdna_drm_wait_cmd {
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware
|
||||
* @buffer: The user space buffer that will return the AIE status
|
||||
* @buffer_size: The size of the user space buffer
|
||||
* @cols_filled: A bitmap of AIE columns whose data has been returned in the
|
||||
* buffer.
|
||||
* @buffer: The user space buffer that will return the AIE status.
|
||||
* @buffer_size: The size of the user space buffer.
|
||||
* @cols_filled: A bitmap of AIE columns whose data has been returned in the buffer.
|
||||
*/
|
||||
struct amdxdna_drm_query_aie_status {
|
||||
__u64 buffer; /* out */
|
||||
@@ -303,8 +300,8 @@ struct amdxdna_drm_query_aie_status {
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware
|
||||
* @major: The major version number
|
||||
* @minor: The minor version number
|
||||
* @major: The major version number.
|
||||
* @minor: The minor version number.
|
||||
*/
|
||||
struct amdxdna_drm_query_aie_version {
|
||||
__u32 major; /* out */
|
||||
@@ -319,7 +316,7 @@ struct amdxdna_drm_query_aie_version {
|
||||
* @dma_channel_count: The number of dma channels.
|
||||
* @lock_count: The number of locks.
|
||||
* @event_reg_count: The number of events.
|
||||
* @pad: MBZ.
|
||||
* @pad: Structure padding.
|
||||
*/
|
||||
struct amdxdna_drm_query_aie_tile_metadata {
|
||||
__u16 row_count;
|
||||
@@ -331,8 +328,7 @@ struct amdxdna_drm_query_aie_tile_metadata {
|
||||
};
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE
|
||||
* hardware
|
||||
* struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE hardware
|
||||
* @col_size: The size of a column in bytes.
|
||||
* @cols: The total number of columns.
|
||||
* @rows: The total number of rows.
|
||||
@@ -355,7 +351,7 @@ struct amdxdna_drm_query_aie_metadata {
|
||||
* struct amdxdna_drm_query_clock - Metadata for a clock
|
||||
* @name: The clock name.
|
||||
* @freq_mhz: The clock frequency.
|
||||
* @pad: MBZ.
|
||||
* @pad: Structure padding.
|
||||
*/
|
||||
struct amdxdna_drm_query_clock {
|
||||
__u8 name[16];
|
||||
@@ -381,14 +377,12 @@ enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER };
|
||||
* @input: The current value of the sensor.
|
||||
* @max: The maximum value possible for the sensor.
|
||||
* @average: The average value of the sensor.
|
||||
* @highest: The highest recorded sensor value for this driver load for the
|
||||
* sensor.
|
||||
* @highest: The highest recorded sensor value for this driver load for the sensor.
|
||||
* @status: The sensor status.
|
||||
* @units: The sensor units.
|
||||
* @unitm: Translates value member variables into the correct unit via (pow(10,
|
||||
* unitm) * value)
|
||||
* @type: The sensor type from enum amdxdna_sensor_type
|
||||
* @pad: MBZ.
|
||||
* @unitm: Translates value member variables into the correct unit via (pow(10, unitm) * value).
|
||||
* @type: The sensor type from enum amdxdna_sensor_type.
|
||||
* @pad: Structure padding.
|
||||
*/
|
||||
struct amdxdna_drm_query_sensor {
|
||||
__u8 label[64];
|
||||
@@ -408,14 +402,14 @@ struct amdxdna_drm_query_sensor {
|
||||
* @context_id: The ID for this context.
|
||||
* @start_col: The starting column for the partition assigned to this context.
|
||||
* @num_col: The number of columns in the partition assigned to this context.
|
||||
* @pad: Structure padding.
|
||||
* @pid: The Process ID of the process that created this context.
|
||||
* @command_submissions: The number of commands submitted to this context.
|
||||
* @command_completions: The number of commands completed by this context.
|
||||
* @migrations: The number of times this context has been moved to a different
|
||||
* partition.
|
||||
* @preemptions: The number of times this context has been preempted by another
|
||||
* context in the same partition.
|
||||
* @pad: MBZ.
|
||||
* @migrations: The number of times this context has been moved to a different partition.
|
||||
* @preemptions: The number of times this context has been preempted by another context in the
|
||||
* same partition.
|
||||
* @errors: The errors for this context.
|
||||
*/
|
||||
struct amdxdna_drm_query_hwctx {
|
||||
__u32 context_id;
|
||||
@@ -471,6 +465,7 @@ enum amdxdna_power_mode_type {
|
||||
POWER_MODE_LOW, /**< Set frequency to lowest DPM */
|
||||
POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */
|
||||
POWER_MODE_HIGH, /**< Set frequency to highest DPM */
|
||||
POWER_MODE_TURBO, /**< More power, more performance */
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -508,13 +503,13 @@ enum amdxdna_drm_get_param {
|
||||
DRM_AMDXDNA_READ_AIE_REG,
|
||||
DRM_AMDXDNA_QUERY_FIRMWARE_VERSION,
|
||||
DRM_AMDXDNA_GET_POWER_MODE,
|
||||
DRM_AMDXDNA_QUERY_TELEMETRY,
|
||||
DRM_AMDXDNA_NUM_GET_PARAM,
|
||||
};
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_get_info - Get some information from the AIE hardware.
|
||||
* @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed
|
||||
* in the buffer.
|
||||
* @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer.
|
||||
* @buffer_size: Size of the input buffer. Size needed/written by the kernel.
|
||||
* @buffer: A structure specified by the param struct member.
|
||||
*/
|
||||
@@ -542,10 +537,8 @@ enum amdxdna_drm_set_param {
|
||||
};
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_set_state - Set the state of some component within the AIE
|
||||
* hardware.
|
||||
* @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed
|
||||
* in the buffer.
|
||||
* struct amdxdna_drm_set_state - Set the state of some component within the AIE hardware.
|
||||
* @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed in the buffer.
|
||||
* @buffer_size: Size of the input buffer.
|
||||
* @buffer: A structure specified by the param struct member.
|
||||
*/
|
||||
@@ -555,63 +548,35 @@ struct amdxdna_drm_set_state {
|
||||
__u64 buffer; /* in */
|
||||
};
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync
|
||||
* objects.
|
||||
* @handles: Array of handles of sync objects.
|
||||
* @points: Array of time points for each sync objects.
|
||||
* @count: Number of elements in the above array.
|
||||
*/
|
||||
struct amdxdna_drm_syncobjs {
|
||||
__u64 handles; /* in */
|
||||
__u64 points; /* in */
|
||||
__u32 count; /* in */
|
||||
__u32 pad;
|
||||
};
|
||||
#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, struct amdxdna_drm_create_hwctx)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
|
||||
struct amdxdna_drm_create_hwctx)
|
||||
#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, struct amdxdna_drm_destroy_hwctx)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \
|
||||
struct amdxdna_drm_destroy_hwctx)
|
||||
#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, struct amdxdna_drm_config_hwctx)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \
|
||||
struct amdxdna_drm_config_hwctx)
|
||||
#define DRM_IOCTL_AMDXDNA_CREATE_BO \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, struct amdxdna_drm_create_bo)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_CREATE_BO \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \
|
||||
struct amdxdna_drm_create_bo)
|
||||
#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, struct amdxdna_drm_get_bo_info)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \
|
||||
struct amdxdna_drm_get_bo_info)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_SYNC_BO \
|
||||
#define DRM_IOCTL_AMDXDNA_SYNC_BO \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
|
||||
#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
|
||||
#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_GET_INFO \
|
||||
#define DRM_IOCTL_AMDXDNA_GET_INFO \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_SET_STATE \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \
|
||||
struct amdxdna_drm_set_state)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \
|
||||
struct amdxdna_drm_syncobjs)
|
||||
|
||||
#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \
|
||||
struct amdxdna_drm_syncobjs)
|
||||
#define DRM_IOCTL_AMDXDNA_SET_STATE \
|
||||
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, struct amdxdna_drm_set_state)
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} /* extern c end */
|
||||
|
||||
@@ -100,6 +100,9 @@ public:
|
||||
return system_allocator_;
|
||||
}
|
||||
|
||||
/// @brief Getter for the AIE system deallocator.
|
||||
const std::function<void(void*)>& system_deallocator() const { return system_deallocator_; }
|
||||
|
||||
// AIE agent methods.
|
||||
/// @brief Get the number of columns on this AIE agent.
|
||||
uint32_t GetNumCols() const { return num_cols_; }
|
||||
@@ -124,6 +127,9 @@ private:
|
||||
core::MemoryRegion::AllocateFlags flags)>
|
||||
system_allocator_;
|
||||
|
||||
|
||||
std::function<void(void*)> system_deallocator_;
|
||||
|
||||
const hsa_profile_t profile_ = HSA_PROFILE_BASE;
|
||||
const uint32_t min_aql_size_ = 0x40;
|
||||
const uint32_t max_aql_size_ = 0x40;
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
#include "core/inc/queue.h"
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/inc/signal.h"
|
||||
#include "core/util/locks.h"
|
||||
|
||||
namespace rocr {
|
||||
namespace AMD {
|
||||
@@ -131,6 +130,12 @@ private:
|
||||
/// @brief Base of the queue's ring buffer storage.
|
||||
void *ring_buf_ = nullptr;
|
||||
|
||||
/// @brief Called when the doorbell is rung to iterate over
/// all packets and submit them. Submission is done by
/// calling into the XdnaDriver.
|
||||
hsa_status_t SubmitCmd(XdnaDriver& driver, void* queue_base, uint64_t read_dispatch_id,
|
||||
uint64_t write_dispatch_id);
|
||||
|
||||
/// @brief Handle for an application context on the AIE device.
|
||||
///
|
||||
/// Each user queue will have an associated context. This handle is assigned
|
||||
@@ -154,4 +159,4 @@ private:
|
||||
} // namespace AMD
|
||||
} // namespace rocr
|
||||
|
||||
#endif // header guard
|
||||
#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
|
||||
|
||||
@@ -45,9 +45,46 @@
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "core/driver/xdna/uapi/amdxdna_accel.h"
|
||||
#include "core/inc/amd_aie_agent.h"
|
||||
#include "core/inc/driver.h"
|
||||
#include "core/inc/memory_region.h"
|
||||
|
||||
/// @brief struct amdxdna_cmd_chain - Interpretation of data payload for
|
||||
/// ERT_CMD_CHAIN
|
||||
struct amdxdna_cmd_chain {
|
||||
/// Number of commands in chain
|
||||
__u32 command_count;
|
||||
/// Index of last successfully submitted command in chain
|
||||
__u32 submit_index;
|
||||
/// Index of failing command if cmd status is not completed
|
||||
__u32 error_index;
|
||||
__u32 reserved[3];
|
||||
/// Address of each command in chain
|
||||
__u64 data[] __counted_by(command_count);
|
||||
};
|
||||
|
||||
/// @brief struct amdxdna_cmd - Exec buffer command header format
/// The anonymous union exposes the header either field by field or as the
/// single 32-bit word `header`; the bit-field widths below add up to 32.
|
||||
struct amdxdna_cmd {
|
||||
union {
|
||||
struct {
|
||||
/// Current state of a command
|
||||
__u32 state : 4;
|
||||
/// Unused bits
__u32 unused : 6;
|
||||
/// Extra CU masks in addition to mandatory mask
|
||||
__u32 extra_cu_masks : 2;
|
||||
/// Number of words in payload (data)
|
||||
__u32 count : 11;
|
||||
/// Opcode identifying specific command
|
||||
__u32 opcode : 5;
|
||||
/// Reserved bits
__u32 reserved : 4;
|
||||
};
|
||||
/// The whole header viewed as one 32-bit word
__u32 header;
|
||||
};
|
||||
/// Packet payload. Flexible array member whose length in words is
/// annotated as `count` via __counted_by.
|
||||
__u32 data[] __counted_by(count);
|
||||
};
|
||||
|
||||
namespace rocr {
|
||||
namespace core {
|
||||
class Queue;
|
||||
@@ -55,6 +92,40 @@ class Queue;
|
||||
|
||||
namespace AMD {
|
||||
|
||||
/// @brief: The number of arguments in the packet payload before we start passing operands
|
||||
constexpr uint32_t NON_OPERAND_COUNT = 6;
|
||||
|
||||
// @brief: Used to transform an address into a device address
|
||||
constexpr uint32_t DEV_ADDR_BASE = 0x04000000;
|
||||
constexpr uint32_t DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;
|
||||
|
||||
/// @brief: The driver places a structure before each command in a command chain.
|
||||
/// Need to increase the size of the command by the size of this structure.
|
||||
/// In the following xdna driver source can see where this is implemented:
|
||||
/// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
|
||||
/// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
|
||||
/// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
|
||||
constexpr uint32_t CMD_COUNT_SIZE_INCREASE = 3;
|
||||
|
||||
/// @brief: The size of an instruction in bytes
|
||||
constexpr uint32_t INSTR_SIZE_BYTES = 4;
|
||||
|
||||
/// @brief: Index of command payload where the instruction sequence
|
||||
/// address is located
|
||||
constexpr uint32_t CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2;
|
||||
constexpr uint32_t CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX = 4;
|
||||
|
||||
/// @brief Default value for the job submission timeout (presumably overridable through an environment variable; confirm against the driver source)
|
||||
constexpr uint32_t DEFAULT_TIMEOUT_VAL = 50;
|
||||
|
||||
/// @brief: Calculates the number of operands in a packet
|
||||
/// given the number of arguments in the packet
|
||||
/// @param: arg_count(Input), Number of arguments in the packet
|
||||
/// @return: uint32_t, The number of operands in the packet
|
||||
inline uint32_t GetOperandCount(uint32_t arg_count) {
|
||||
return ((arg_count - NON_OPERAND_COUNT) / 2);
|
||||
}
|
||||
|
||||
class XdnaDriver final : public core::Driver {
|
||||
public:
|
||||
XdnaDriver(std::string devnode_name);
|
||||
@@ -68,6 +139,9 @@ public:
|
||||
hsa_status_t Init() override;
|
||||
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
|
||||
|
||||
std::unordered_map<uint32_t, void*>& GetHandleMappings();
|
||||
std::unordered_map<void*, uint32_t>& GetAddrMappings();
|
||||
|
||||
hsa_status_t GetAgentProperties(core::Agent &agent) const override;
|
||||
hsa_status_t
|
||||
GetMemoryProperties(uint32_t node_id,
|
||||
@@ -84,7 +158,11 @@ public:
|
||||
hsa_status_t CreateQueue(core::Queue &queue) const override;
|
||||
hsa_status_t DestroyQueue(core::Queue &queue) const override;
|
||||
|
||||
private:
|
||||
// @brief Submits num_pkts packets in a command chain to the XDNA driver
|
||||
hsa_status_t SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
|
||||
uint32_t num_operands, uint32_t hw_ctx_handle);
|
||||
|
||||
private:
|
||||
hsa_status_t QueryDriverVersion();
|
||||
/// @brief Allocate device accessible heap space.
|
||||
///
|
||||
@@ -92,27 +170,55 @@ private:
|
||||
hsa_status_t InitDeviceHeap();
|
||||
hsa_status_t FreeDeviceHeap();
|
||||
|
||||
/// @brief Creates a command BO and returns a pointer to the memory and
|
||||
/// the corresponding handle
|
||||
///
|
||||
/// @param size size of memory to allocate
|
||||
/// @param handle A pointer to the BO handle
|
||||
/// @param cmd A pointer to the buffer
|
||||
hsa_status_t CreateCmd(uint32_t size, uint32_t* handle, amdxdna_cmd** cmd);
|
||||
|
||||
/// @brief Adds all BOs in a command packet payload to a vector
|
||||
/// and replaces the handles with a virtual address
|
||||
///
|
||||
/// @param count Number of entries in the command
|
||||
/// @param bo_args A pointer to a vector that contains all bo handles
|
||||
/// @param cmd_pkt_payload A pointer to the payload of the command
|
||||
hsa_status_t RegisterCmdBOs(uint32_t count, std::vector<uint32_t>& bo_args,
|
||||
std::vector<uint32_t>& bo_sizes, std::vector<uint64_t>& bo_addrs,
|
||||
hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload,
|
||||
const std::unordered_map<void*, uint32_t>& vmem_addr_mappings);
|
||||
|
||||
/// @brief Syncs all BOs referenced in bo_args
|
||||
///
|
||||
/// @param bo_args vector containing handles of BOs to sync
|
||||
hsa_status_t SyncBos(const std::vector<uint64_t>& bo_args, const std::vector<uint32_t>& bo_sizes);
|
||||
|
||||
/// @brief Executes a command and waits for its completion
|
||||
///
|
||||
/// @param exec_cmd Structure containing the details of the command to execute
|
||||
/// @param hw_ctx_handle the handle of the hardware context to run this
|
||||
/// command
|
||||
hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd* exec_cmd, uint32_t hw_ctx_handle);
|
||||
|
||||
/// TODO: Remove this in the future and rely on the core Runtime
|
||||
/// object to track handle allocations. Using the VMEM API for mapping XDNA
|
||||
/// driver handles requires a bit more refactoring. So rely on the XDNA driver
|
||||
/// to manage some of this for now.
|
||||
std::unordered_map<uint32_t, void *> vmem_handle_mappings;
|
||||
std::unordered_map<void*, uint32_t> vmem_addr_mappings;
|
||||
|
||||
/// @brief Virtual address range allocated for the device heap.
|
||||
///
|
||||
/// Allocate a large enough space so we can carve out the device heap in
|
||||
/// this range and ensure it is aligned to 64MB. Currently, AIE2 supports
|
||||
/// 48MB device heap and it must be aligned to 64MB.
|
||||
/// this range and ensure it is aligned to 64MB. Currently, npu1 supports
|
||||
/// 64MB device heap and it must be aligned to 64MB.
|
||||
void *dev_heap_parent = nullptr;
|
||||
|
||||
/// @brief The aligned device heap.
|
||||
void *dev_heap_aligned = nullptr;
|
||||
static constexpr size_t dev_heap_size = 48 * 1024 * 1024;
|
||||
static constexpr size_t dev_heap_size = 64 * 1024 * 1024;
|
||||
static constexpr size_t dev_heap_align = 64 * 1024 * 1024;
|
||||
|
||||
/// @brief DRM buffer object handle for the device heap. Assigned by the
|
||||
/// kernel-mode driver.
|
||||
uint32_t dev_heap_handle = 0;
|
||||
};
|
||||
|
||||
} // namespace AMD
|
||||
|
||||
@@ -322,6 +322,8 @@ void AieAgent::InitAllocators() {
|
||||
? mem
|
||||
: nullptr;
|
||||
};
|
||||
|
||||
system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
+65
-18
@@ -41,22 +41,19 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "core/inc/amd_aie_aql_queue.h"
|
||||
#include "core/inc/amd_xdna_driver.h"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <Windows.h>
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <thread>
|
||||
#include <cstring>
|
||||
|
||||
#include "core/inc/queue.h"
|
||||
#include "core/inc/runtime.h"
|
||||
@@ -104,7 +101,12 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts,
|
||||
drv.CreateQueue(*this);
|
||||
}
|
||||
|
||||
AieAqlQueue::~AieAqlQueue() { Inactivate(); }
|
||||
AieAqlQueue::~AieAqlQueue() {
|
||||
AieAqlQueue::Inactivate();
|
||||
if (ring_buf_) {
|
||||
agent_.system_deallocator()(ring_buf_);
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t AieAqlQueue::Inactivate() {
|
||||
bool active(active_.exchange(false, std::memory_order_relaxed));
|
||||
@@ -193,8 +195,54 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {
|
||||
}
|
||||
|
||||
void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
|
||||
atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value),
|
||||
std::memory_order_release);
|
||||
auto& driver = static_cast<XdnaDriver&>(agent_.driver());
|
||||
SubmitCmd(driver, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id,
|
||||
amd_queue_.write_dispatch_id);
|
||||
}
|
||||
|
||||
/// @brief Walks the packets in [read_dispatch_id, write_dispatch_id) and
/// submits them to the XDNA driver.
///
/// Every packet that starts a submission must be a vendor-specific AIE ERT
/// packet. A run of contiguous HSA_AMD_AIE_ERT_START_CU packets is handed to
/// XdnaDriver::SubmitCmdChain() as a single command chain.
///
/// @param driver Driver that receives the command chains.
/// @param queue_base Base address of the packet storage; packets are indexed
/// from it by dispatch ID.
/// @param read_dispatch_id ID of the first packet to process.
/// @param write_dispatch_id One past the ID of the last packet to process.
/// @return HSA_STATUS_SUCCESS when every packet was submitted;
/// HSA_STATUS_ERROR on an unexpected packet header, an unsupported opcode or
/// a failed driver submission.
hsa_status_t AieAqlQueue::SubmitCmd(XdnaDriver& driver, void* queue_base, uint64_t read_dispatch_id,
                                    uint64_t write_dispatch_id) {
  // NOTE(review): packets are addressed as queue_base + dispatch ID with no
  // wrap-around on the ring size -- confirm callers never pass IDs beyond the
  // ring capacity.
  auto* const packets = static_cast<hsa_amd_aie_ert_packet_t*>(queue_base);

  uint64_t cur_id = read_dispatch_id;
  while (cur_id < write_dispatch_id) {
    hsa_amd_aie_ert_packet_t* pkt = packets + cur_id;

    // Get the packet header information
    if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC ||
        pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT)
      return HSA_STATUS_ERROR;

    // Get the payload information
    switch (pkt->opcode) {
      case HSA_AMD_AIE_ERT_START_CU: {
        // Look ahead to count how many contiguous HSA_AMD_AIE_ERT_START_CU
        // packets follow. All of them can be combined into a single chain.
        uint32_t num_cont_start_cu_pkts = 1;
        // NOTE(review): only the operands of the look-ahead packets are
        // accumulated; the first packet of the chain is not counted. Confirm
        // this is what SubmitCmdChain() expects.
        uint32_t num_operands = 0;
        // Dispatch IDs are 64-bit, so the look-ahead index must be too; a
        // 32-bit signed index would truncate and compare signed/unsigned.
        for (uint64_t peek_pkt_id = cur_id + 1; peek_pkt_id < write_dispatch_id; ++peek_pkt_id) {
          const hsa_amd_aie_ert_packet_t* peek_pkt = packets + peek_pkt_id;
          if (peek_pkt->opcode != HSA_AMD_AIE_ERT_START_CU) {
            break;
          }
          num_operands += GetOperandCount(peek_pkt->count);
          ++num_cont_start_cu_pkts;
        }

        // Call into the driver to submit the whole run starting at cur_id
        if (driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, num_operands, hw_ctx_handle_) !=
            HSA_STATUS_SUCCESS)
          return HSA_STATUS_ERROR;

        cur_id += num_cont_start_cu_pkts;
        break;
      }
      default: {
        // Only START_CU packets are supported.
        return HSA_STATUS_ERROR;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
|
||||
@@ -205,16 +253,15 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
|
||||
hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
|
||||
void *value) {
|
||||
switch (attribute) {
|
||||
case HSA_AMD_QUEUE_INFO_AGENT:
|
||||
*(reinterpret_cast<hsa_agent_t *>(value)) = agent_.public_handle();
|
||||
break;
|
||||
case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
|
||||
// Hardware doorbell supports AQL semantics.
|
||||
*(reinterpret_cast<uint64_t *>(value)) =
|
||||
reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
|
||||
break;
|
||||
default:
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
case HSA_AMD_QUEUE_INFO_AGENT:
|
||||
*static_cast<hsa_agent_t*>(value) = agent_.public_handle();
|
||||
break;
|
||||
case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
|
||||
// Hardware doorbell supports AQL semantics.
|
||||
*static_cast<uint64_t*>(value) = reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
|
||||
break;
|
||||
default:
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -348,6 +348,29 @@ static __forceinline std::string& rtrim(std::string& s) {
|
||||
|
||||
static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); }
|
||||
|
||||
/// @brief: Flush the cachelines associated with the
/// provided address, offset, and length.
///
/// Every cacheline that overlaps [base + offset, base + offset + len) is
/// flushed with _mm_clflush. The line size is queried once through sysconf
/// and cached; if it cannot be determined the function does nothing.
///
/// @param: base(Input), base address to flush
/// @param: offset(Input), offset of base address to flush
/// @param: len(Input), length of buffer to flush; a length of 0 is a no-op
inline void FlushCpuCache(const void* base, size_t offset, size_t len) {
  static long cacheline_size = 0;

  // Nothing to flush. Returning here also avoids computing an address one
  // byte before the buffer and touching memory the caller never named.
  if (len == 0) return;

  if (!cacheline_size) {
    long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    if (sz <= 0) return;
    cacheline_size = sz;
  }

  // The mask arithmetic assumes the line size is a power of two.
  const uintptr_t line_mask = static_cast<uintptr_t>(cacheline_size) - 1;
  const uintptr_t first_byte = reinterpret_cast<uintptr_t>(base) + offset;
  // Address of the last byte of the last cacheline covered by the range.
  const uintptr_t last_line_end = (first_byte + len - 1) | line_mask;

  for (uintptr_t addr = first_byte; addr <= last_line_end;
       addr += static_cast<uintptr_t>(cacheline_size)) {
    _mm_clflush(reinterpret_cast<const void*>(addr));
  }
}
|
||||
|
||||
} // namespace rocr
|
||||
|
||||
template <uint32_t lowBit, uint32_t highBit, typename T>
|
||||
@@ -394,6 +417,20 @@ inline uint32_t PtrHigh32(const void* p) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// @brief: Concatenates two numbers of type InType to a number of type OutType.
///
/// Both inputs are treated as raw bit patterns: they are converted through the
/// unsigned counterpart of InType before widening, so a negative `lo` does not
/// sign-extend into the bits that belong to `hi`, and a negative `hi` is never
/// shifted as a signed value.
///
/// @param: hi(Input), To be placed in the upper bits of the output
/// @param: lo(Input), To be placed in the lower bits of the output
/// @return: OutType, Concatenation of hi and lo
template <typename OutType, typename InType>
typename std::enable_if<std::is_integral<OutType>::value && std::is_integral<InType>::value &&
                            sizeof(OutType) >= 2 * sizeof(InType),
                        OutType>::type
Concat(InType hi, InType lo) {
  using UnsignedIn = typename std::make_unsigned<InType>::type;
  using UnsignedOut = typename std::make_unsigned<OutType>::type;

  const UnsignedOut hi_bits = static_cast<UnsignedOut>(static_cast<UnsignedIn>(hi));
  const UnsignedOut lo_bits = static_cast<UnsignedOut>(static_cast<UnsignedIn>(lo));

  return static_cast<OutType>((hi_bits << (sizeof(InType) * 8)) | lo_bits);
}
|
||||
|
||||
|
||||
#include "atomic_helpers.h"
|
||||
|
||||
#endif // HSA_RUNTIME_CORE_UTIL_UTILS_H_
|
||||
|
||||
Ссылка в новой задаче
Block a user