rocr/aie: AIE Queue Processing

Change-Id: I681c971ba7229037ca85d5529838aa7bbe5820e2


[ROCm/ROCR-Runtime commit: e9cc839b2b]
Этот коммит содержится в:
Eddie Richter
2024-12-06 18:16:26 +00:00
коммит произвёл David Yat Sin
родитель a317650289
Коммит 8ea388af92
8 изменённых файлов: 538 добавлений и 141 удалений
+252 -23
Просмотреть файл
@@ -117,7 +117,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
return HSA_STATUS_ERROR;
}
aie_agent.SetNumCols(aie_metadata.cols);
// Right now can only target N-1 columns as that is the
// number of shim DMAs in npu1 devices.
aie_agent.SetNumCols(aie_metadata.cols - 1);
aie_agent.SetNumCoreRows(aie_metadata.core.row_count);
return HSA_STATUS_SUCCESS;
@@ -147,7 +149,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
}
if (m_region.kernarg()) {
create_bo_args.type = AMDXDNA_BO_CMD;
create_bo_args.type = AMDXDNA_BO_SHMEM;
} else {
create_bo_args.type = AMDXDNA_BO_DEV;
}
@@ -189,11 +191,26 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
}
vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem);
vmem_addr_mappings.emplace(mapped_mem, create_bo_args.handle);
return HSA_STATUS_SUCCESS;
}
/// @brief Releases the buffer object recorded for a previously allocated address.
///
/// Looks up the DRM handle stored for @p mem, closes it with
/// DRM_IOCTL_GEM_CLOSE and then removes both bookkeeping entries.
///
/// @param mem Address that was recorded in vmem_addr_mappings.
/// @param size Not used by this implementation.
/// @return HSA_STATUS_ERROR_INVALID_ALLOCATION when @p mem is not tracked,
/// HSA_STATUS_ERROR when the close ioctl fails, HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) {
  const auto addr_entry = vmem_addr_mappings.find(mem);
  if (addr_entry == vmem_addr_mappings.end()) {
    return HSA_STATUS_ERROR_INVALID_ALLOCATION;
  }

  // NOTE(review): only the GEM handle is closed here; the address itself is
  // not unmapped in this function - confirm that is handled elsewhere.
  const auto bo_handle = addr_entry->second;
  drm_gem_close gem_close = {};
  gem_close.handle = bo_handle;
  const bool close_failed = ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &gem_close) < 0;
  if (close_failed) {
    return HSA_STATUS_ERROR;
  }

  // Drop the bookkeeping only once the kernel has released the handle.
  vmem_handle_mappings.erase(bo_handle);
  vmem_addr_mappings.erase(addr_entry);
  return HSA_STATUS_SUCCESS;
}
@@ -207,18 +224,10 @@ hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const {
// Currently we do not leverage QoS information.
amdxdna_qos_info qos_info{0};
amdxdna_drm_create_hwctx create_hwctx_args{
.ext = 0,
.ext_flags = 0,
.qos_p = reinterpret_cast<uintptr_t>(&qos_info),
.umq_bo = 0,
.log_buf_bo = 0,
// TODO: Make this configurable.
.max_opc = 0x800,
// This field is for the number of core tiles.
.num_tiles = aie_agent.GetNumCores(),
.mem_size = 0,
.umq_doorbell = 0};
amdxdna_drm_create_hwctx create_hwctx_args = {};
create_hwctx_args.qos_p = reinterpret_cast<uintptr_t>(&qos_info);
create_hwctx_args.max_opc = 0x800;
create_hwctx_args.num_tiles = static_cast<uint32_t>(aie_agent.GetNumCores());
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -261,13 +270,9 @@ hsa_status_t XdnaDriver::QueryDriverVersion() {
}
hsa_status_t XdnaDriver::InitDeviceHeap() {
amdxdna_drm_create_bo create_bo_args{
.flags = 0,
.type = AMDXDNA_BO_DEV_HEAP,
._pad = 0,
.vaddr = reinterpret_cast<uintptr_t>(nullptr),
.size = dev_heap_size,
.handle = 0};
amdxdna_drm_create_bo create_bo_args = {};
create_bo_args.size = dev_heap_size;
create_bo_args.type = AMDXDNA_BO_DEV_HEAP;
amdxdna_drm_get_bo_info get_bo_info_args{0};
drm_gem_close close_bo_args{0};
@@ -316,19 +321,243 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
return HSA_STATUS_SUCCESS;
}
/// @brief Accessor for the handle-to-address bookkeeping table.
/// @return Mutable reference to vmem_handle_mappings.
std::unordered_map<uint32_t, void*>& XdnaDriver::GetHandleMappings() { return vmem_handle_mappings; }
/// @brief Accessor for the address-to-handle bookkeeping table.
/// @return Mutable reference to vmem_addr_mappings.
std::unordered_map<void*, uint32_t>& XdnaDriver::GetAddrMappings() {
  return vmem_addr_mappings;
}
/// @brief Unmaps the virtual address ranges reserved for the device heap.
///
/// Each range is unmapped exactly once and its pointer is reset only after a
/// successful munmap, so a failed call leaves the pointer intact for a retry.
///
/// @return HSA_STATUS_ERROR if either munmap fails, HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::FreeDeviceHeap() {
  if (dev_heap_parent) {
    // Same length expression as the original unmap of the parent range.
    if (munmap(dev_heap_parent, dev_heap_align * 2 - 1) != 0) return HSA_STATUS_ERROR;
    dev_heap_parent = nullptr;
  }

  if (dev_heap_aligned) {
    if (munmap(dev_heap_aligned, dev_heap_size) != 0) return HSA_STATUS_ERROR;
    dev_heap_aligned = nullptr;
  }

  return HSA_STATUS_SUCCESS;
}
/// @brief Flushes the CPU cache for every buffer described by the two vectors.
///
/// @param bo_addrs Addresses of the buffers to flush.
/// @param bo_sizes Size of each buffer; must have the same length as @p bo_addrs.
/// @return HSA_STATUS_ERROR when the vector lengths differ, HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::SyncBos(const std::vector<uint64_t>& bo_addrs,
                                 const std::vector<uint32_t>& bo_sizes) {
  const auto bo_count = bo_addrs.size();
  if (bo_sizes.size() != bo_count) {
    return HSA_STATUS_ERROR;
  }

  for (std::vector<uint64_t>::size_type idx = 0; idx < bo_count; ++idx) {
    void* bo_ptr = reinterpret_cast<void*>(bo_addrs[idx]);
    FlushCpuCache(bo_ptr, 0, bo_sizes[idx]);
  }
  return HSA_STATUS_SUCCESS;
}
/// @brief Submits a command to the driver and blocks until the wait ioctl returns.
///
/// @param exec_cmd Submission descriptor; its seq field is read back after the
/// submit ioctl and used to identify the command to wait for.
/// @param hw_ctx_handle Hardware context the command was submitted to.
/// @return HSA_STATUS_ERROR if either ioctl reports a non-zero result,
/// HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::ExecCmdAndWait(amdxdna_drm_exec_cmd* exec_cmd, uint32_t hw_ctx_handle) {
  // Step 1: hand the command to the driver.
  const int submit_rc = ioctl(fd_, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd);
  if (submit_rc != 0) {
    return HSA_STATUS_ERROR;
  }

  // Step 2: wait on the sequence number of the submitted command.
  amdxdna_drm_wait_cmd wait_args = {};
  wait_args.seq = exec_cmd->seq;
  wait_args.hwctx = hw_ctx_handle;
  wait_args.timeout = DEFAULT_TIMEOUT_VAL;
  const int wait_rc = ioctl(fd_, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_args);
  if (wait_rc != 0) {
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}
/// @brief Collects the buffer objects referenced by one command payload.
///
/// Appends the handle, address and size of the instruction sequence buffer
/// followed by those of every operand to the output vectors, then rewrites the
/// low word of the instruction sequence address in the payload as a device
/// address (DEV_ADDR_BASE combined with the masked offset).
///
/// @param count Number of entries in the command payload.
/// @param bo_args Receives the BO handle of each referenced buffer.
/// @param bo_sizes Receives the size of each referenced buffer.
/// @param bo_addrs Receives the address of each referenced buffer.
/// @param cmd_pkt_payload Payload to read from; modified in place.
/// @param vmem_addr_mappings Lookup table from address to BO handle.
/// @return HSA_STATUS_ERROR when the payload is null, when @p count is too
/// small to hold the fixed (non-operand) fields, or when an address has no
/// known handle; HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::RegisterCmdBOs(
    uint32_t count, std::vector<uint32_t>& bo_args, std::vector<uint32_t>& bo_sizes,
    std::vector<uint64_t>& bo_addrs, hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload,
    const std::unordered_map<void*, uint32_t>& vmem_addr_mappings) {
  // A count below NON_OPERAND_COUNT would underflow the unsigned operand
  // computation and make the loops below read far past the payload.
  if (cmd_pkt_payload == nullptr || count < NON_OPERAND_COUNT) return HSA_STATUS_ERROR;

  // This is the index where the operand addresses start in a command
  const uint32_t operand_starting_index = 5;

  // Counting the number of operands in the command payload.
  const uint32_t num_operands = GetOperandCount(count);

  // The 64-bit instruction sequence address is split over two 32-bit words,
  // low word first.
  const uint64_t instr_addr = Concat<uint64_t, uint32_t>(
      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX + 1],
      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]);
  const auto instr_handle = vmem_addr_mappings.find(reinterpret_cast<void*>(instr_addr));
  if (instr_handle == vmem_addr_mappings.end()) return HSA_STATUS_ERROR;

  // Keep track of the handles and addresses before we submit the packet
  bo_args.push_back(instr_handle->second);
  bo_addrs.push_back(instr_addr);

  // Adding the instruction sequence size. The packet contains the number of
  // instructions.
  const uint32_t instr_bo_size =
      cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX] * INSTR_SIZE_BYTES;
  bo_sizes.push_back(instr_bo_size);

  // Going through all of the operands in the command, keeping track of the
  // addresses and turning the addresses into handles. Each operand address
  // occupies two consecutive 32-bit fields starting at
  // `operand_starting_index`.
  for (uint32_t operand_iter = 0; operand_iter < num_operands; operand_iter++) {
    const uint32_t operand_index = operand_starting_index + 2 * operand_iter;
    const uint64_t operand_addr = Concat<uint64_t, uint32_t>(
        cmd_pkt_payload->data[operand_index + 1], cmd_pkt_payload->data[operand_index]);
    const auto operand_handle = vmem_addr_mappings.find(reinterpret_cast<void*>(operand_addr));
    if (operand_handle == vmem_addr_mappings.end()) return HSA_STATUS_ERROR;
    bo_args.push_back(operand_handle->second);
    bo_addrs.push_back(operand_addr);
  }

  // The operand sizes follow the operand addresses, one 32-bit field each.
  // The size is used to sync the buffer.
  const uint32_t operand_size_starting_index = operand_starting_index + 2 * num_operands;
  for (uint32_t operand_iter = 0; operand_iter < num_operands; operand_iter++) {
    bo_sizes.push_back(cmd_pkt_payload->data[operand_size_starting_index + operand_iter]);
  }

  // Transform the instruction sequence address into device address. The
  // parentheses only make the existing evaluation order explicit.
  cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] =
      DEV_ADDR_BASE | (instr_addr & DEV_ADDR_OFFSET_MASK);

  return HSA_STATUS_SUCCESS;
}
/// @brief Creates a command buffer object and maps it into the process.
///
/// @param size Size in bytes requested for the command BO.
/// @param handle Receives the BO handle on success.
/// @param cmd Receives the mapped address of the BO on success.
/// @return HSA_STATUS_ERROR if an output pointer is null or if creating,
/// querying or mapping the BO fails (the BO is closed again in the latter two
/// cases and the outputs are left untouched); HSA_STATUS_SUCCESS otherwise.
hsa_status_t XdnaDriver::CreateCmd(uint32_t size, uint32_t* handle, amdxdna_cmd** cmd) {
  if (handle == nullptr || cmd == nullptr) return HSA_STATUS_ERROR;

  // Creating the command
  amdxdna_drm_create_bo create_cmd_bo = {};
  create_cmd_bo.type = AMDXDNA_BO_CMD;
  create_cmd_bo.size = size;
  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) return HSA_STATUS_ERROR;

  // Best-effort release of the BO so failure paths below do not leak it.
  auto close_cmd_bo = [&]() {
    drm_gem_close close_args = {};
    close_args.handle = create_cmd_bo.handle;
    ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_args);
  };

  amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {};
  cmd_bo_get_bo_info.handle = create_cmd_bo.handle;
  if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info)) {
    close_cmd_bo();
    return HSA_STATUS_ERROR;
  }

  // The mapping result itself must be compared against MAP_FAILED; comparing
  // the out-parameter pointer would never detect a failed mmap.
  void* mapped = mmap(nullptr, create_cmd_bo.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                      cmd_bo_get_bo_info.map_offset);
  if (mapped == MAP_FAILED) {
    close_cmd_bo();
    return HSA_STATUS_ERROR;
  }

  *cmd = static_cast<amdxdna_cmd*>(mapped);
  *handle = create_cmd_bo.handle;

  return HSA_STATUS_SUCCESS;
}
/// @brief Submits a run of contiguous ERT packets to the driver as one command chain.
///
/// For each packet a command BO is created and filled from the packet, all
/// referenced buffers are collected, a chain command listing the per-packet
/// command handles is built and executed, and every temporary command BO is
/// unmapped and closed again. Referenced buffers are cache-flushed before and
/// after execution.
///
/// @param first_pkt First packet of the contiguous run.
/// @param num_pkts Number of packets in the run.
/// @param num_operands Expected number of operands; used only as a reserve hint.
/// @param hw_ctx_handle Hardware context to execute on.
/// @return HSA_STATUS_SUCCESS when every step succeeds, HSA_STATUS_ERROR
/// otherwise. Temporary command BOs are released on every path.
hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
                                        uint32_t num_operands, uint32_t hw_ctx_handle) {
  // Storing the metadata of the BOs that store the operands and metadata
  // of the commands we are going to submit
  std::vector<uint32_t> bo_args;
  std::vector<uint32_t> bo_sizes;
  std::vector<uint64_t> bo_addrs;
  bo_args.reserve(num_operands);
  bo_sizes.reserve(num_operands);
  bo_addrs.reserve(num_operands);

  // Storing the commands that we are going to submit and the
  // corresponding metadata
  std::vector<uint32_t> cmd_handles;
  std::vector<uint32_t> cmd_sizes;
  std::vector<amdxdna_cmd*> cmds;
  cmd_handles.reserve(num_pkts);
  cmd_sizes.reserve(num_pkts);
  cmds.reserve(num_pkts);

  // Unmaps and closes every per-packet command BO created so far. Closing is
  // best-effort; the return value reports whether every munmap succeeded.
  auto release_cmd_bos = [&]() {
    bool all_unmapped = true;
    drm_gem_close close_bo_args = {};
    for (size_t i = 0; i < cmd_handles.size(); i++) {
      if (munmap(cmds[i], cmd_sizes[i]) != 0) all_unmapped = false;
      close_bo_args.handle = cmd_handles[i];
      ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
    }
    return all_unmapped;
  };

  // Iterating over all the contiguous HSA_AMD_AIE_ERT_CMD_CHAIN packets
  for (uint32_t pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
    // Getting the current command packet
    hsa_amd_aie_ert_packet_t* pkt = first_pkt + pkt_iter;
    hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload =
        reinterpret_cast<hsa_amd_aie_ert_start_kernel_data_t*>(pkt->payload_data);

    // Record the handles of all BOs referenced by the packet and rewrite the
    // instruction sequence address in the payload as a device address.
    if (RegisterCmdBOs(pkt->count, bo_args, bo_sizes, bo_addrs, cmd_pkt_payload,
                       vmem_addr_mappings) != HSA_STATUS_SUCCESS) {
      release_cmd_bos();
      return HSA_STATUS_ERROR;
    }

    // Creating a packet that contains the command to execute the kernel
    uint32_t cmd_bo_handle = 0;
    amdxdna_cmd* cmd = nullptr;
    const uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
    if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd)) {
      release_cmd_bos();
      return HSA_STATUS_ERROR;
    }

    // Track the BO immediately so that every later exit path releases it.
    cmd_handles.push_back(cmd_bo_handle);
    cmds.push_back(cmd);
    cmd_sizes.push_back(cmd_size);

    // Filling in the fields of the command
    cmd->state = pkt->state;
    cmd->extra_cu_masks = 0;
    // The driver places a structure before each command in a command chain.
    // Need to increase the size of the command by the size of this structure.
    cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
    cmd->opcode = pkt->opcode;
    cmd->data[0] = cmd_pkt_payload->cu_mask;
    memcpy((cmd->data + 1), cmd_pkt_payload->data, sizeof(uint32_t) * pkt->count);
  }

  // Creating a packet that contains the command chain
  // NOTE(review): the requested size is one 32-bit word per handle plus one,
  // while the payload written below is sizeof(amdxdna_cmd_chain) plus one
  // 64-bit entry per command - confirm the driver sizes the BO large enough.
  uint32_t cmd_chain_bo_handle = 0;
  amdxdna_cmd* cmd_chain = nullptr;
  const uint32_t cmd_chain_size = (cmd_handles.size() + 1) * sizeof(uint32_t);
  if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain)) {
    release_cmd_bos();
    return HSA_STATUS_ERROR;
  }

  // Writing information to the command buffer
  amdxdna_cmd_chain* cmd_chain_payload = reinterpret_cast<amdxdna_cmd_chain*>(cmd_chain->data);

  // Creating a command chain
  cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
  cmd_chain->extra_cu_masks = 0;
  cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
  cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
  cmd_chain_payload->command_count = cmd_handles.size();
  cmd_chain_payload->submit_index = 0;
  cmd_chain_payload->error_index = 0;
  for (size_t i = 0; i < cmd_handles.size(); i++) {
    cmd_chain_payload->data[i] = cmd_handles[i];
  }

  // Syncing BOs before we execute the command
  hsa_status_t status = SyncBos(bo_addrs, bo_sizes);

  if (status == HSA_STATUS_SUCCESS) {
    // Removing duplicates in the bo container. The driver will report
    // an error if we provide the same BO handle multiple times.
    // This can happen if any of the BOs are the same across jobs
    std::sort(bo_args.begin(), bo_args.end());
    bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end());

    // Filling in the fields to execute the command chain
    amdxdna_drm_exec_cmd exec_cmd_0 = {};
    exec_cmd_0.hwctx = hw_ctx_handle;
    exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
    exec_cmd_0.cmd_handles = cmd_chain_bo_handle;
    exec_cmd_0.args = reinterpret_cast<uint64_t>(bo_args.data());
    exec_cmd_0.cmd_count = 1;
    exec_cmd_0.arg_count = bo_args.size();

    // Executing all commands in the command chain; a failed submit or wait
    // is reported to the caller after cleanup.
    status = ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle);
  }

  // Unmapping and closing the cmd BOs
  bool all_unmapped = release_cmd_bos();

  // Unmapping and closing the cmd_chain BO
  if (munmap(cmd_chain, cmd_chain_size) != 0) all_unmapped = false;
  drm_gem_close close_chain_args = {};
  close_chain_args.handle = cmd_chain_bo_handle;
  ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_chain_args);

  if (status != HSA_STATUS_SUCCESS || !all_unmapped) return HSA_STATUS_ERROR;

  // Syncing BOs after we execute the command
  if (SyncBos(bo_addrs, bo_sizes)) return HSA_STATUS_ERROR;

  return HSA_STATUS_SUCCESS;
}
} // namespace AMD
} // namespace rocr
+55 -90
Просмотреть файл
@@ -21,7 +21,6 @@ extern "C" {
#define AMDXDNA_DRIVER_MAJOR 1
#define AMDXDNA_DRIVER_MINOR 0
#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
#define AMDXDNA_INVALID_ADDR (~0UL)
#define AMDXDNA_INVALID_CTX_HANDLE 0
#define AMDXDNA_INVALID_BO_HANDLE 0
@@ -50,11 +49,9 @@ enum amdxdna_drm_ioctl_id {
DRM_AMDXDNA_GET_BO_INFO,
DRM_AMDXDNA_SYNC_BO,
DRM_AMDXDNA_EXEC_CMD,
DRM_AMDXDNA_WAIT_CMD,
DRM_AMDXDNA_GET_INFO,
DRM_AMDXDNA_SET_STATE,
DRM_AMDXDNA_SUBMIT_WAIT,
DRM_AMDXDNA_SUBMIT_SIGNAL,
DRM_AMDXDNA_WAIT_CMD,
DRM_AMDXDNA_NUM_IOCTLS
};
@@ -96,6 +93,7 @@ struct amdxdna_qos_info {
* @mem_size: Size of AIE tile memory.
* @umq_doorbell: Returned offset of doorbell associated with UMQ.
* @handle: Returned hardware context handle.
* @pad: Structure padding.
*/
struct amdxdna_drm_create_hwctx {
__u64 ext;
@@ -108,12 +106,13 @@ struct amdxdna_drm_create_hwctx {
__u32 mem_size;
__u32 umq_doorbell;
__u32 handle;
__u32 pad;
};
/**
* struct amdxdna_drm_destroy_hwctx - Destroy hardware context.
* @handle: Hardware context handle.
* @pad: MBZ.
* @pad: Structure padding.
*/
struct amdxdna_drm_destroy_hwctx {
__u32 handle;
@@ -122,9 +121,9 @@ struct amdxdna_drm_destroy_hwctx {
/**
* struct amdxdna_cu_config - configuration for one CU
* @cu_bo: CU configuration buffer bo handle
* @cu_func: Functional of a CU
* @pad: MBZ
* @cu_bo: CU configuration buffer bo handle.
* @cu_func: Function of a CU.
* @pad: Structure padding.
*/
struct amdxdna_cu_config {
__u32 cu_bo;
@@ -135,9 +134,9 @@ struct amdxdna_cu_config {
/**
* struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware
* context
* @num_cus: Number of CUs to configure
* @pad: MBZ
* @cu_configs: Array of CU configurations of struct amdxdna_cu_config
* @num_cus: Number of CUs to configure.
* @pad: Structure padding.
* @cu_configs: Array of CU configurations of struct amdxdna_cu_config.
*/
struct amdxdna_hwctx_param_config_cu {
__u16 num_cus;
@@ -160,6 +159,7 @@ enum amdxdna_drm_config_hwctx_param {
* @param_val: A structure specified by the param_type struct member.
* @param_val_size: Size of the parameter buffer pointed to by the param_val.
* If param_val is not a pointer, driver can ignore this.
* @pad: Structure padding.
*
* Note: if the param_val is a pointer pointing to a buffer, the maximum size
* of the buffer is 4KiB(PAGE_SIZE).
@@ -191,17 +191,16 @@ enum amdxdna_bo_type {
/**
* struct amdxdna_drm_create_bo - Create a buffer object.
* @flags: Buffer flags. MBZ.
* @type: Buffer type.
* @vaddr: User VA of buffer if applied. MBZ.
* @size: Size in bytes.
* @type: Buffer type.
* @handle: Returned DRM buffer object handle.
*/
struct amdxdna_drm_create_bo {
__u64 flags;
__u32 type;
__u32 _pad;
__u64 vaddr;
__u64 size;
__u32 type;
__u32 handle;
};
@@ -210,6 +209,7 @@ struct amdxdna_drm_create_bo {
* @ext: MBZ.
* @ext_flags: MBZ.
* @handle: DRM buffer object handle.
* @pad: Structure padding.
* @map_offset: Returned DRM fake offset for mmap().
* @vaddr: Returned user VA of buffer. 0 in case user needs mmap().
* @xdna_addr: Returned XDNA device virtual address.
@@ -218,7 +218,7 @@ struct amdxdna_drm_get_bo_info {
__u64 ext;
__u64 ext_flags;
__u32 handle;
__u32 _pad;
__u32 pad;
__u64 map_offset;
__u64 vaddr;
__u64 xdna_addr;
@@ -252,8 +252,8 @@ enum amdxdna_cmd_type {
* @ext_flags: MBZ.
* @hwctx: Hardware context handle.
* @type: One of command type in enum amdxdna_cmd_type.
* @cmd_handles: Array of command handles or the command handle itself in case
* of just one.
* @cmd_handles: Array of command handles or the command handle itself
* in case of just one.
* @args: Array of arguments for all command handles.
* @cmd_count: Number of command handles in the cmd_handles array.
* @arg_count: Number of arguments in the args array.
@@ -279,8 +279,6 @@ struct amdxdna_drm_exec_cmd {
* @seq: sequence number of the command returned by execute command.
*
* Wait a command specified by seq to be completed.
* Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
* to submit a new command.
*/
struct amdxdna_drm_wait_cmd {
__u32 hwctx;
@@ -290,10 +288,9 @@ struct amdxdna_drm_wait_cmd {
/**
* struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware
* @buffer: The user space buffer that will return the AIE status
* @buffer_size: The size of the user space buffer
* @cols_filled: A bitmap of AIE columns whose data has been returned in the
* buffer.
* @buffer: The user space buffer that will return the AIE status.
* @buffer_size: The size of the user space buffer.
* @cols_filled: A bitmap of AIE columns whose data has been returned in the buffer.
*/
struct amdxdna_drm_query_aie_status {
__u64 buffer; /* out */
@@ -303,8 +300,8 @@ struct amdxdna_drm_query_aie_status {
/**
* struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware
* @major: The major version number
* @minor: The minor version number
* @major: The major version number.
* @minor: The minor version number.
*/
struct amdxdna_drm_query_aie_version {
__u32 major; /* out */
@@ -319,7 +316,7 @@ struct amdxdna_drm_query_aie_version {
* @dma_channel_count: The number of dma channels.
* @lock_count: The number of locks.
* @event_reg_count: The number of events.
* @pad: MBZ.
* @pad: Structure padding.
*/
struct amdxdna_drm_query_aie_tile_metadata {
__u16 row_count;
@@ -331,8 +328,7 @@ struct amdxdna_drm_query_aie_tile_metadata {
};
/**
* struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE
* hardware
* struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE hardware
* @col_size: The size of a column in bytes.
* @cols: The total number of columns.
* @rows: The total number of rows.
@@ -355,7 +351,7 @@ struct amdxdna_drm_query_aie_metadata {
* struct amdxdna_drm_query_clock - Metadata for a clock
* @name: The clock name.
* @freq_mhz: The clock frequency.
* @pad: MBZ.
* @pad: Structure padding.
*/
struct amdxdna_drm_query_clock {
__u8 name[16];
@@ -381,14 +377,12 @@ enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER };
* @input: The current value of the sensor.
* @max: The maximum value possible for the sensor.
* @average: The average value of the sensor.
* @highest: The highest recorded sensor value for this driver load for the
* sensor.
* @highest: The highest recorded sensor value for this driver load for the sensor.
* @status: The sensor status.
* @units: The sensor units.
* @unitm: Translates value member variables into the correct unit via (pow(10,
* unitm) * value)
* @type: The sensor type from enum amdxdna_sensor_type
* @pad: MBZ.
* @unitm: Translates value member variables into the correct unit via (pow(10, unitm) * value).
* @type: The sensor type from enum amdxdna_sensor_type.
* @pad: Structure padding.
*/
struct amdxdna_drm_query_sensor {
__u8 label[64];
@@ -408,14 +402,14 @@ struct amdxdna_drm_query_sensor {
* @context_id: The ID for this context.
* @start_col: The starting column for the partition assigned to this context.
* @num_col: The number of columns in the partition assigned to this context.
* @pad: Structure padding.
* @pid: The Process ID of the process that created this context.
* @command_submissions: The number of commands submitted to this context.
* @command_completions: The number of commands completed by this context.
* @migrations: The number of times this context has been moved to a different
* partition.
* @preemptions: The number of times this context has been preempted by another
* context in the same partition.
* @pad: MBZ.
* @migrations: The number of times this context has been moved to a different partition.
* @preemptions: The number of times this context has been preempted by another context in the
* same partition.
* @errors: The errors for this context.
*/
struct amdxdna_drm_query_hwctx {
__u32 context_id;
@@ -471,6 +465,7 @@ enum amdxdna_power_mode_type {
POWER_MODE_LOW, /**< Set frequency to lowest DPM */
POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */
POWER_MODE_HIGH, /**< Set frequency to highest DPM */
POWER_MODE_TURBO, /**< More power, more performance */
};
/**
@@ -508,13 +503,13 @@ enum amdxdna_drm_get_param {
DRM_AMDXDNA_READ_AIE_REG,
DRM_AMDXDNA_QUERY_FIRMWARE_VERSION,
DRM_AMDXDNA_GET_POWER_MODE,
DRM_AMDXDNA_QUERY_TELEMETRY,
DRM_AMDXDNA_NUM_GET_PARAM,
};
/**
* struct amdxdna_drm_get_info - Get some information from the AIE hardware.
* @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed
* in the buffer.
* @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer.
* @buffer_size: Size of the input buffer. Size needed/written by the kernel.
* @buffer: A structure specified by the param struct member.
*/
@@ -542,10 +537,8 @@ enum amdxdna_drm_set_param {
};
/**
* struct amdxdna_drm_set_state - Set the state of some component within the AIE
* hardware.
* @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed
* in the buffer.
* struct amdxdna_drm_set_state - Set the state of some component within the AIE hardware.
* @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed in the buffer.
* @buffer_size: Size of the input buffer.
* @buffer: A structure specified by the param struct member.
*/
@@ -555,63 +548,35 @@ struct amdxdna_drm_set_state {
__u64 buffer; /* in */
};
/**
* struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync
* objects.
* @handles: Array of handles of sync objects.
* @points: Array of time points for each sync objects.
* @count: Number of elements in the above array.
*/
struct amdxdna_drm_syncobjs {
__u64 handles; /* in */
__u64 points; /* in */
__u32 count; /* in */
__u32 pad;
};
#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, struct amdxdna_drm_create_hwctx)
#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
struct amdxdna_drm_create_hwctx)
#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, struct amdxdna_drm_destroy_hwctx)
#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \
struct amdxdna_drm_destroy_hwctx)
#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, struct amdxdna_drm_config_hwctx)
#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \
struct amdxdna_drm_config_hwctx)
#define DRM_IOCTL_AMDXDNA_CREATE_BO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, struct amdxdna_drm_create_bo)
#define DRM_IOCTL_AMDXDNA_CREATE_BO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \
struct amdxdna_drm_create_bo)
#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, struct amdxdna_drm_get_bo_info)
#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \
struct amdxdna_drm_get_bo_info)
#define DRM_IOCTL_AMDXDNA_SYNC_BO \
#define DRM_IOCTL_AMDXDNA_SYNC_BO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo)
#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd)
#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd)
#define DRM_IOCTL_AMDXDNA_GET_INFO \
#define DRM_IOCTL_AMDXDNA_GET_INFO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info)
#define DRM_IOCTL_AMDXDNA_SET_STATE \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \
struct amdxdna_drm_set_state)
#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \
struct amdxdna_drm_syncobjs)
#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \
struct amdxdna_drm_syncobjs)
#define DRM_IOCTL_AMDXDNA_SET_STATE \
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, struct amdxdna_drm_set_state)
#if defined(__cplusplus)
} /* extern c end */
+6
Просмотреть файл
@@ -100,6 +100,9 @@ public:
return system_allocator_;
}
/// @brief Read-only access to the callable stored in system_deallocator_.
/// @return Const reference to the stored std::function taking the pointer to release.
const std::function<void(void*)>& system_deallocator() const {
  return system_deallocator_;
}
// AIE agent methods.
/// @brief Get the number of columns on this AIE agent.
uint32_t GetNumCols() const { return num_cols_; }
@@ -124,6 +127,9 @@ private:
core::MemoryRegion::AllocateFlags flags)>
system_allocator_;
std::function<void(void*)> system_deallocator_;
const hsa_profile_t profile_ = HSA_PROFILE_BASE;
const uint32_t min_aql_size_ = 0x40;
const uint32_t max_aql_size_ = 0x40;
+7 -2
Просмотреть файл
@@ -49,7 +49,6 @@
#include "core/inc/queue.h"
#include "core/inc/runtime.h"
#include "core/inc/signal.h"
#include "core/util/locks.h"
namespace rocr {
namespace AMD {
@@ -131,6 +130,12 @@ private:
/// @brief Base of the queue's ring buffer storage.
void *ring_buf_ = nullptr;
/// @brief Called when the doorbell is rung to iterate over
/// all packets and submit them. Submission is done by
/// calling into the XdnaDriver.
hsa_status_t SubmitCmd(XdnaDriver& driver, void* queue_base, uint64_t read_dispatch_id,
uint64_t write_dispatch_id);
/// @brief Handle for an application context on the AIE device.
///
/// Each user queue will have an associated context. This handle is assigned
@@ -154,4 +159,4 @@ private:
} // namespace AMD
} // namespace rocr
#endif // header guard
#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
+114 -8
Просмотреть файл
@@ -45,9 +45,46 @@
#include <memory>
#include <unordered_map>
#include "core/driver/xdna/uapi/amdxdna_accel.h"
#include "core/inc/amd_aie_agent.h"
#include "core/inc/driver.h"
#include "core/inc/memory_region.h"
/// @brief struct amdxdna_cmd_chain - Interpretation of data payload for
/// ERT_CMD_CHAIN
///
/// This layout is overlaid on the data[] payload of an amdxdna_cmd whose
/// opcode selects a command chain.
struct amdxdna_cmd_chain {
/// Number of commands in chain; also the element count of data[]
__u32 command_count;
/// Index of last successfully submitted command in chain
__u32 submit_index;
/// Index of failing command if cmd status is not completed
__u32 error_index;
/// Reserved words between the fixed fields and the per-command entries
__u32 reserved[3];
/// One 64-bit entry per command in the chain
/// NOTE(review): described as an address, but the submission code visible
/// alongside this header stores command BO handles here - confirm.
__u64 data[] __counted_by(command_count);
};
/// @brief struct amdxdna_cmd - Exec buffer command header format
///
/// A single 32-bit header word, accessible either as the packed bitfields
/// below or as the raw `header` value, followed by the payload words.
struct amdxdna_cmd {
union {
struct {
/// Current state of a command
__u32 state : 4;
/// Bits not assigned a meaning in this layout
__u32 unused : 6;
/// Extra CU masks in addition to mandatory mask
__u32 extra_cu_masks : 2;
/// Number of words in payload (data)
__u32 count : 11;
/// Opcode identifying specific command
__u32 opcode : 5;
/// Remaining bits of the 32-bit header word
__u32 reserved : 4;
};
/// The same 32 bits viewed as one word
__u32 header;
};
/// Packet payload words; `count` gives the number of words
__u32 data[] __counted_by(count);
};
namespace rocr {
namespace core {
class Queue;
@@ -55,6 +92,40 @@ class Queue;
namespace AMD {
/// @brief: The number of arguments in the packet payload before we start passing operands
constexpr uint32_t NON_OPERAND_COUNT = 6;
// @brief: Used to transform an address into a device address
constexpr uint32_t DEV_ADDR_BASE = 0x04000000;
constexpr uint32_t DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;
/// @brief: The driver places a structure before each command in a command chain.
/// Need to increase the size of the command by the size of this structure.
/// In the following xdna driver source can see where this is implemented:
/// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
/// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
/// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
constexpr uint32_t CMD_COUNT_SIZE_INCREASE = 3;
/// @brief: The size of an instruction in bytes
constexpr uint32_t INSTR_SIZE_BYTES = 4;
/// @brief: Index of command payload where the instruction sequence
/// address is located
constexpr uint32_t CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2;
constexpr uint32_t CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX = 4;
/// @brief Default timeout value passed to the wait-command ioctl after a job submission
constexpr uint32_t DEFAULT_TIMEOUT_VAL = 50;
/// @brief: Calculates the number of operands in a packet
/// given the number of arguments in the packet
/// @param: arg_count(Input), Number of arguments in the packet
/// @return: uint32_t, The number of operands in the packet; 0 when
/// arg_count is smaller than NON_OPERAND_COUNT
inline uint32_t GetOperandCount(uint32_t arg_count) {
  // Guard the unsigned subtraction: without it a short packet would wrap
  // around and report roughly two billion operands.
  if (arg_count < NON_OPERAND_COUNT) return 0;
  // Each operand takes two arguments beyond the non-operand fields.
  return (arg_count - NON_OPERAND_COUNT) / 2;
}
class XdnaDriver final : public core::Driver {
public:
XdnaDriver(std::string devnode_name);
@@ -68,6 +139,9 @@ public:
hsa_status_t Init() override;
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
std::unordered_map<uint32_t, void*>& GetHandleMappings();
std::unordered_map<void*, uint32_t>& GetAddrMappings();
hsa_status_t GetAgentProperties(core::Agent &agent) const override;
hsa_status_t
GetMemoryProperties(uint32_t node_id,
@@ -84,7 +158,11 @@ public:
hsa_status_t CreateQueue(core::Queue &queue) const override;
hsa_status_t DestroyQueue(core::Queue &queue) const override;
private:
// @brief Submits num_pkts packets in a command chain to the XDNA driver
hsa_status_t SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts,
uint32_t num_operands, uint32_t hw_ctx_handle);
private:
hsa_status_t QueryDriverVersion();
/// @brief Allocate device accessible heap space.
///
@@ -92,27 +170,55 @@ private:
hsa_status_t InitDeviceHeap();
hsa_status_t FreeDeviceHeap();
/// @brief Creates a command BO and returns a pointer to the memory and
// the corresponding handle
///
/// @param size size of memory to allocate
/// @param handle A pointer to the BO handle
/// @param cmd A pointer to the buffer
hsa_status_t CreateCmd(uint32_t size, uint32_t* handle, amdxdna_cmd** cmd);
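/// @brief Adds all BOs in a command packet payload to a vector
/// and rewrites the instruction sequence address as a device address
///
/// @param count Number of entries in the command
/// @param bo_args A vector that receives all bo handles
/// @param bo_sizes A vector that receives the size of each bo
/// @param bo_addrs A vector that receives the virtual address of each bo
/// @param cmd_pkt_payload A pointer to the payload of the command
/// @param vmem_addr_mappings Map from virtual address to bo handle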
hsa_status_t RegisterCmdBOs(uint32_t count, std::vector<uint32_t>& bo_args,
std::vector<uint32_t>& bo_sizes, std::vector<uint64_t>& bo_addrs,
hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload,
const std::unordered_map<void*, uint32_t>& vmem_addr_mappings);
/// @brief Syncs all BOs referenced in bo_args
///
/// @param bo_args vector containing the addresses of the BOs to sync
/// @param bo_sizes vector containing the size of each BO in bo_args
hsa_status_t SyncBos(const std::vector<uint64_t>& bo_args, const std::vector<uint32_t>& bo_sizes);
/// @brief Executes a command and waits for its completion
///
/// @param exec_cmd Structure containing the details of the command to execute
/// @param hw_ctx_handle the handle of the hardware context to run this
/// command
hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd* exec_cmd, uint32_t hw_ctx_handle);
/// TODO: Remove this in the future and rely on the core Runtime
/// object to track handle allocations. Using the VMEM API for mapping XDNA
/// driver handles requires a bit more refactoring. So rely on the XDNA driver
/// to manage some of this for now.
std::unordered_map<uint32_t, void *> vmem_handle_mappings;
std::unordered_map<void*, uint32_t> vmem_addr_mappings;
/// @brief Virtual address range allocated for the device heap.
///
/// Allocate a large enough space so we can carve out the device heap in
/// this range and ensure it is aligned to 64MB. Currently, npu1 supports
/// 64MB device heap and it must be aligned to 64MB.
void *dev_heap_parent = nullptr;
/// @brief The aligned device heap.
void *dev_heap_aligned = nullptr;
/// @brief Size of the device heap in bytes (64MB).
static constexpr size_t dev_heap_size = 64 * 1024 * 1024;
/// @brief Alignment required for the device heap in bytes (64MB).
static constexpr size_t dev_heap_align = 64 * 1024 * 1024;
/// @brief DRM buffer object handle for the device heap. Assigned by the
/// kernel-mode driver.
uint32_t dev_heap_handle = 0;
};
} // namespace AMD
+2
Просмотреть файл
@@ -322,6 +322,8 @@ void AieAgent::InitAllocators() {
? mem
: nullptr;
};
system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); };
break;
}
}
+65 -18
Просмотреть файл
@@ -41,22 +41,19 @@
////////////////////////////////////////////////////////////////////////////////
#include "core/inc/amd_aie_aql_queue.h"
#include "core/inc/amd_xdna_driver.h"
#ifdef __linux__
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#ifdef _WIN32
#include <Windows.h>
#endif
#include <stdio.h>
#include <string.h>
#include <thread>
#include <cstring>
#include "core/inc/queue.h"
#include "core/inc/runtime.h"
@@ -104,7 +101,12 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts,
drv.CreateQueue(*this);
}
/// Tears the queue down: deactivates it, then hands the ring buffer (when one
/// is held) to the agent's system deallocator.
AieAqlQueue::~AieAqlQueue() {
  // Qualified call: always run this class's own Inactivate() from the
  // destructor.
  AieAqlQueue::Inactivate();
  if (ring_buf_) {
    agent_.system_deallocator()(ring_buf_);
  }
}
hsa_status_t AieAqlQueue::Inactivate() {
bool active(active_.exchange(false, std::memory_order_relaxed));
@@ -193,8 +195,54 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {
}
/// Doorbell store for the AIE queue. Writes `value` to the hardware doorbell
/// pointer and then submits the packets between the queue's read and write
/// dispatch ids to the XDNA driver through SubmitCmd().
///
/// NOTE(review): this listing comes from a diff view; the doorbell store looks
/// like the pre-change body and the SubmitCmd() call like its replacement —
/// confirm whether the store is meant to remain.
void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
// Note the store uses release ordering despite the function's name.
atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value),
std::memory_order_release);
auto& driver = static_cast<XdnaDriver&>(agent_.driver());
// The status returned by SubmitCmd() is discarded; this function returns void.
SubmitCmd(driver, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id,
amd_queue_.write_dispatch_id);
}
/// @brief Walks the packets in [read_dispatch_id, write_dispatch_id) and
/// submits them to the driver.
///
/// Every packet must be a vendor-specific AIE ERT packet. A run of consecutive
/// HSA_AMD_AIE_ERT_START_CU packets is handed to the driver as a single
/// command chain.
///
/// @param driver XDNA driver that receives the command chains
/// @param queue_base start of the packet array, indexed by dispatch id
/// @param read_dispatch_id id of the first packet to process
/// @param write_dispatch_id one past the id of the last packet to process
/// @return HSA_STATUS_SUCCESS when every packet was submitted;
/// HSA_STATUS_ERROR on an unexpected header or opcode, or when the driver
/// fails to submit a chain
hsa_status_t AieAqlQueue::SubmitCmd(XdnaDriver& driver, void* queue_base, uint64_t read_dispatch_id,
                                    uint64_t write_dispatch_id) {
  // NOTE(review): dispatch ids index the packet array directly, without any
  // wrap-around to the queue size — confirm callers never pass ids beyond it.
  auto* const packets = static_cast<hsa_amd_aie_ert_packet_t*>(queue_base);

  uint64_t cur_id = read_dispatch_id;
  while (cur_id < write_dispatch_id) {
    hsa_amd_aie_ert_packet_t* pkt = packets + cur_id;

    // Only vendor-specific AIE ERT packets are accepted.
    if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC ||
        pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT)
      return HSA_STATUS_ERROR;

    switch (pkt->opcode) {
      case HSA_AMD_AIE_ERT_START_CU: {
        // Look ahead to count how many contiguous HSA_AMD_AIE_ERT_START_CU
        // packets follow; all of them can be combined into a single chain.
        int num_cont_start_cu_pkts = 1;
        // NOTE(review): only the operands of the look-ahead packets are
        // counted, not those of the first packet — confirm this is what
        // SubmitCmdChain() expects.
        int num_operands = 0;
        // The index is 64-bit like the dispatch ids, so it is neither
        // truncated nor compared across signedness.
        for (uint64_t peek_id = cur_id + 1; peek_id < write_dispatch_id; ++peek_id) {
          const hsa_amd_aie_ert_packet_t* peek_pkt = packets + peek_id;
          if (peek_pkt->opcode != HSA_AMD_AIE_ERT_START_CU) {
            break;
          }
          num_operands += GetOperandCount(peek_pkt->count);
          num_cont_start_cu_pkts++;
        }

        // Hand the whole run, starting at the current packet, to the driver.
        if (driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, num_operands, hw_ctx_handle_) !=
            HSA_STATUS_SUCCESS)
          return HSA_STATUS_ERROR;

        cur_id += num_cont_start_cu_pkts;
        break;
      }
      default: {
        // Any other opcode is not supported.
        return HSA_STATUS_ERROR;
      }
    }
  }
  return HSA_STATUS_SUCCESS;
}
void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
@@ -205,16 +253,15 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
/// @brief Reports a queue attribute.
///
/// @param attribute attribute being queried
/// @param value output location; written as an hsa_agent_t for
/// HSA_AMD_QUEUE_INFO_AGENT and as a uint64_t for
/// HSA_AMD_QUEUE_INFO_DOORBELL_ID
/// @return HSA_STATUS_SUCCESS when the attribute was written,
/// HSA_STATUS_ERROR_INVALID_ARGUMENT for any other attribute
hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute,
                                  void *value) {
  switch (attribute) {
    case HSA_AMD_QUEUE_INFO_AGENT:
      *static_cast<hsa_agent_t*>(value) = agent_.public_handle();
      break;
    case HSA_AMD_QUEUE_INFO_DOORBELL_ID:
      // Hardware doorbell supports AQL semantics.
      *static_cast<uint64_t*>(value) = reinterpret_cast<uint64_t>(signal_.hardware_doorbell_ptr);
      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }
  return HSA_STATUS_SUCCESS;
}
+37
Просмотреть файл
@@ -348,6 +348,29 @@ static __forceinline std::string& rtrim(std::string& s) {
/// @brief: Strips both ends of a string in place: rtrim() first, then ltrim()
/// on its result.
/// @param: s(Input/Output), string to trim
/// @return: std::string&, the reference returned by ltrim()
static __forceinline std::string& trim(std::string& s) {
  std::string& right_trimmed = rtrim(s);
  return ltrim(right_trimmed);
}
/// @brief: Flush the cachelines associated with the
/// provided address, offset, and length
/// @param: base(Input), base address to flush
/// @param: offset(Input), offset of base address to flush
/// @param: len(Input), length of buffer to flush; a length of zero
/// flushes nothing
inline void FlushCpuCache(const void* base, size_t offset, size_t len) {
  // An empty range covers no cache line; do not touch memory at all.
  if (len == 0) return;

  // Queried once; the initialisation of a function-local static is
  // thread-safe. If the line size cannot be queried, fall back to 64 bytes
  // (assumed x86 line size) instead of silently skipping the flush.
  static const long cacheline_size = [] {
    const long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    return sz > 0 ? sz : 64L;
  }();

  const char* cur = static_cast<const char*>(base) + offset;
  // Address of the last byte of the last cache line touched by the range.
  const uintptr_t last_line_end =
      reinterpret_cast<uintptr_t>(cur + len - 1) | static_cast<uintptr_t>(cacheline_size - 1);

  // `cur` keeps its offset within the line; each step lands in the next line
  // until the line containing the final byte has been flushed.
  while (reinterpret_cast<uintptr_t>(cur) <= last_line_end) {
    _mm_clflush(cur);
    cur += cacheline_size;
  }
}
} // namespace rocr
template <uint32_t lowBit, uint32_t highBit, typename T>
@@ -394,6 +417,20 @@ inline uint32_t PtrHigh32(const void* p) {
return ptr;
}
/// @brief: Concatenates two numbers of type InType to a number of type OutType
/// @param: hi(Input), To be placed in the upper bits of the output
/// @param: lo(Input), To be placed in the lower bits of the output; only its
/// sizeof(InType) * 8 bits are used, so a negative value cannot spill into
/// the bits that belong to hi
/// @return: OutType, Concatenation of hi and lo
template <typename OutType, typename InType>
typename std::enable_if<std::is_integral<OutType>::value && std::is_integral<InType>::value &&
                            sizeof(OutType) >= 2 * sizeof(InType),
                        OutType>::type
Concat(InType hi, InType lo) {
  // Work in the unsigned counterpart of OutType: shifting is then well defined
  // for negative inputs and no sign extension leaks into the result.
  using UOut = typename std::make_unsigned<OutType>::type;
  constexpr unsigned kInBits = sizeof(InType) * 8;
  // kInBits is at most half the width of UOut, so the shift cannot overflow.
  constexpr UOut kLoMask = static_cast<UOut>((UOut{1} << kInBits) - 1);

  const UOut hi_bits = static_cast<UOut>(static_cast<UOut>(hi) << kInBits);
  const UOut lo_bits = static_cast<UOut>(static_cast<UOut>(lo) & kLoMask);
  return static_cast<OutType>(hi_bits | lo_bits);
}
#include "atomic_helpers.h"
#endif // HSA_RUNTIME_CORE_UTIL_UTILS_H_