diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index bec2e98128..edee88b730 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -117,7 +117,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const { return HSA_STATUS_ERROR; } - aie_agent.SetNumCols(aie_metadata.cols); + // Right now can only target N-1 columns as that is the + // number of shim DMAs in npu1 devices. + aie_agent.SetNumCols(aie_metadata.cols - 1); aie_agent.SetNumCoreRows(aie_metadata.core.row_count); return HSA_STATUS_SUCCESS; @@ -147,7 +149,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } if (m_region.kernarg()) { - create_bo_args.type = AMDXDNA_BO_CMD; + create_bo_args.type = AMDXDNA_BO_SHMEM; } else { create_bo_args.type = AMDXDNA_BO_DEV; } @@ -189,11 +191,26 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); + vmem_addr_mappings.emplace(mapped_mem, create_bo_args.handle); return HSA_STATUS_SUCCESS; } hsa_status_t XdnaDriver::FreeMemory(void *mem, size_t size) { + auto it = vmem_addr_mappings.find(mem); + if (it == vmem_addr_mappings.end()) return HSA_STATUS_ERROR_INVALID_ALLOCATION; + + auto handle = it->second; + + drm_gem_close close_args = {}; + close_args.handle = handle; + if (ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_args) < 0) { + return HSA_STATUS_ERROR; + } + + vmem_handle_mappings.erase(handle); + vmem_addr_mappings.erase(it); + return HSA_STATUS_SUCCESS; } @@ -207,18 +224,10 @@ hsa_status_t XdnaDriver::CreateQueue(core::Queue &queue) const { // Currently we do not leverage QoS information. amdxdna_qos_info qos_info{0}; - amdxdna_drm_create_hwctx create_hwctx_args{ - .ext = 0, - .ext_flags = 0, - .qos_p = reinterpret_cast(&qos_info), - .umq_bo = 0, - .log_buf_bo = 0, - // TODO: Make this configurable. - .max_opc = 0x800, - // This field is for the number of core tiles. - .num_tiles = aie_agent.GetNumCores(), - .mem_size = 0, - .umq_doorbell = 0}; + amdxdna_drm_create_hwctx create_hwctx_args = {}; + create_hwctx_args.qos_p = reinterpret_cast(&qos_info); + create_hwctx_args.max_opc = 0x800; + create_hwctx_args.num_tiles = static_cast(aie_agent.GetNumCores()); if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hwctx_args) < 0) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -261,13 +270,9 @@ hsa_status_t XdnaDriver::QueryDriverVersion() { } hsa_status_t XdnaDriver::InitDeviceHeap() { - amdxdna_drm_create_bo create_bo_args{ - .flags = 0, - .type = AMDXDNA_BO_DEV_HEAP, - ._pad = 0, - .vaddr = reinterpret_cast(nullptr), - .size = dev_heap_size, - .handle = 0}; + amdxdna_drm_create_bo create_bo_args = {}; + create_bo_args.size = dev_heap_size; + create_bo_args.type = AMDXDNA_BO_DEV_HEAP; amdxdna_drm_get_bo_info get_bo_info_args{0}; drm_gem_close close_bo_args{0}; @@ -316,19 +321,243 @@ hsa_status_t XdnaDriver::InitDeviceHeap() { return HSA_STATUS_SUCCESS; } +std::unordered_map& XdnaDriver::GetHandleMappings() { + return vmem_handle_mappings; +} + +std::unordered_map& XdnaDriver::GetAddrMappings() { return vmem_addr_mappings; } + hsa_status_t XdnaDriver::FreeDeviceHeap() { if (dev_heap_parent) { - munmap(dev_heap_parent, dev_heap_align * 2 - 1); + if (munmap(dev_heap_parent, dev_heap_align * 2 - 1) != 0) return HSA_STATUS_ERROR; dev_heap_parent = nullptr; } if (dev_heap_aligned) { - munmap(dev_heap_aligned, dev_heap_size); + if (munmap(dev_heap_aligned, dev_heap_size) != 0) return HSA_STATUS_ERROR; dev_heap_aligned = nullptr; } return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::SyncBos(const std::vector& bo_addrs, + const std::vector& bo_sizes) { + if (bo_addrs.size() != bo_sizes.size()) return HSA_STATUS_ERROR; + + for (int i = 0; i < bo_addrs.size(); i++) { + FlushCpuCache(reinterpret_cast(bo_addrs[i]), 0, bo_sizes[i]); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::ExecCmdAndWait(amdxdna_drm_exec_cmd* exec_cmd, uint32_t hw_ctx_handle) { + // Submit the cmd + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd)) return HSA_STATUS_ERROR; + + // Waiting for command to finish + amdxdna_drm_wait_cmd wait_cmd = {}; + wait_cmd.hwctx = hw_ctx_handle; + wait_cmd.timeout = DEFAULT_TIMEOUT_VAL; + wait_cmd.seq = exec_cmd->seq; + + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd)) return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::RegisterCmdBOs( + uint32_t count, std::vector& bo_args, std::vector& bo_sizes, + std::vector& bo_addrs, hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload, + const std::unordered_map& vmem_addr_mappings) { + // This is the index where the operand addresses start in a command + const int operand_starting_index = 5; + + // Counting the number of operands in the command payload. + uint32_t num_operands = GetOperandCount(count); + + uint64_t instr_addr = Concat( + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX + 1], + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); + auto instr_handle = vmem_addr_mappings.find(reinterpret_cast(instr_addr)); + + if (instr_handle == vmem_addr_mappings.end()) return HSA_STATUS_ERROR; + + // Keep track of the handles and addresses before we submit the packet + bo_args.push_back(instr_handle->second); + bo_addrs.push_back(instr_addr); + + // Adding the instruction sequence size. The packet contains the number of + // instructions. + uint32_t instr_bo_size = + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX] * INSTR_SIZE_BYTES; + bo_sizes.push_back(instr_bo_size); + + // Going through all of the operands in the command, keeping track of the + // addresses and turning the addresses into handles. The starting index of + // the operands in a command is `operand_starting_index` and the fields + // are 32-bits we need to iterate over every two + for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { + uint32_t operand_index = operand_starting_index + 2 * operand_iter; + uint64_t operand_addr = Concat(cmd_pkt_payload->data[operand_index + 1], + cmd_pkt_payload->data[operand_index]); + auto operand_handle = vmem_addr_mappings.find(reinterpret_cast(operand_addr)); + if (operand_handle == vmem_addr_mappings.end()) return HSA_STATUS_ERROR; + bo_args.push_back(operand_handle->second); + bo_addrs.push_back(operand_addr); + } + + // Going through all of the operands in the command, keeping track of + // the sizes of each operand. The size is used to sync the buffer + uint32_t operand_size_starting_index = operand_starting_index + 2 * num_operands; + for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { + bo_sizes.push_back(cmd_pkt_payload->data[operand_size_starting_index + operand_iter]); + } + + // Transform the instruction sequence address into device address + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] = + DEV_ADDR_BASE | instr_addr & DEV_ADDR_OFFSET_MASK; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::CreateCmd(uint32_t size, uint32_t* handle, amdxdna_cmd** cmd) { + // Creating the command + amdxdna_drm_create_bo create_cmd_bo = {}; + create_cmd_bo.type = AMDXDNA_BO_CMD, create_cmd_bo.size = size; + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) return HSA_STATUS_ERROR; + + amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {}; + cmd_bo_get_bo_info.handle = create_cmd_bo.handle; + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info)) return HSA_STATUS_ERROR; + + *cmd = static_cast(mmap(nullptr, create_cmd_bo.size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd_, cmd_bo_get_bo_info.map_offset)); + + if (cmd == MAP_FAILED) return HSA_STATUS_ERROR; + + *handle = create_cmd_bo.handle; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts, + uint32_t num_operands, uint32_t hw_ctx_handle) { + // Storing the metadata of the BOs that store the operands and metadata + // of the commands we are going to submit + std::vector bo_args; + std::vector bo_sizes; + std::vector bo_addrs; + bo_args.reserve(num_operands); + bo_sizes.reserve(num_operands); + bo_addrs.reserve(num_operands); + + // Storing the commands that we are going to submit and the + // corresponding metadata + std::vector cmd_handles; + std::vector cmd_sizes; + std::vector cmds; + cmd_handles.reserve(num_pkts); + cmd_sizes.reserve(num_pkts); + cmds.reserve(num_pkts); + + // Iterating over all the contiguous HSA_AMD_AIE_ERT_CMD_CHAIN packets + for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) { + // Getting the current command packet + hsa_amd_aie_ert_packet_t* pkt = first_pkt + pkt_iter; + hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload = + reinterpret_cast(pkt->payload_data); + + // Add the handles for all of the BOs to bo_args as well as rewrite + // the command payload handles to contain the actual virtual addresses + if (RegisterCmdBOs(pkt->count, bo_args, bo_sizes, bo_addrs, cmd_pkt_payload, + vmem_addr_mappings) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + // Creating a packet that contains the command to execute the kernel + uint32_t cmd_bo_handle = 0; + amdxdna_cmd* cmd = nullptr; + uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t); + if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd)) return HSA_STATUS_ERROR; + + // Filling in the fields of the command + cmd->state = pkt->state; + cmd->extra_cu_masks = 0; + + // The driver places a structure before each command in a command chain. + // Need to increase the size of the command by the size of this structure. + cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE; + cmd->opcode = pkt->opcode; + cmd->data[0] = cmd_pkt_payload->cu_mask; + memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count); + + // Keeping track of the handle + cmd_handles.push_back(cmd_bo_handle); + cmds.push_back(cmd); + cmd_sizes.push_back(cmd_size); + } + + // Creating a packet that contains the command chain + uint32_t cmd_chain_bo_handle = 0; + amdxdna_cmd* cmd_chain = nullptr; + int cmd_chain_size = (cmd_handles.size() + 1) * sizeof(uint32_t); + if (CreateCmd(cmd_chain_size, &cmd_chain_bo_handle, &cmd_chain)) return HSA_STATUS_ERROR; + + // Writing information to the command buffer + amdxdna_cmd_chain* cmd_chain_payload = reinterpret_cast(cmd_chain->data); + + // Creating a command chain + cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t); + cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN; + cmd_chain_payload->command_count = cmd_handles.size(); + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + for (int i = 0; i < cmd_handles.size(); i++) { + cmd_chain_payload->data[i] = cmd_handles[i]; + } + + // Syncing BOs before we execute the command + if (SyncBos(bo_addrs, bo_sizes)) return HSA_STATUS_ERROR; + + // Removing duplicates in the bo container. The driver will report + // an error if we provide the same BO handle multiple times. + // This can happen if any of the BOs are the same across jobs + std::sort(bo_args.begin(), bo_args.end()); + bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end()); + + // Filling in the fields to execute the command chain + amdxdna_drm_exec_cmd exec_cmd_0 = {}; + exec_cmd_0.hwctx = hw_ctx_handle; + exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF; + exec_cmd_0.cmd_handles = cmd_chain_bo_handle; + exec_cmd_0.args = reinterpret_cast(bo_args.data()); + exec_cmd_0.cmd_count = 1; + exec_cmd_0.arg_count = bo_args.size(); + + // Executing all commands in the command chain + ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle); + + // Unmapping and closing the cmd BOs + drm_gem_close close_bo_args{0}; + for (int i = 0; i < cmd_handles.size(); i++) { + if (munmap(cmds[i], cmd_sizes[i]) != 0) return HSA_STATUS_ERROR; + close_bo_args.handle = cmd_handles[i]; + ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + } + + // Unmapping and closing the cmd_chain BO + if (munmap(cmd_chain, cmd_chain_size) != 0) return HSA_STATUS_ERROR; + close_bo_args.handle = cmd_chain_bo_handle; + ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args); + + // Syncing BOs after we execute the command + if (SyncBos(bo_addrs, bo_sizes)) return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + } // namespace AMD } // namespace rocr diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h index 9182a0fd7b..cd939b56c2 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h @@ -21,7 +21,6 @@ extern "C" { #define AMDXDNA_DRIVER_MAJOR 1 #define AMDXDNA_DRIVER_MINOR 0 -#define AMDXDNA_INVALID_CMD_HANDLE (~0UL) #define AMDXDNA_INVALID_ADDR (~0UL) #define AMDXDNA_INVALID_CTX_HANDLE 0 #define AMDXDNA_INVALID_BO_HANDLE 0 @@ -50,11 +49,9 @@ enum amdxdna_drm_ioctl_id { DRM_AMDXDNA_GET_BO_INFO, DRM_AMDXDNA_SYNC_BO, DRM_AMDXDNA_EXEC_CMD, - DRM_AMDXDNA_WAIT_CMD, DRM_AMDXDNA_GET_INFO, DRM_AMDXDNA_SET_STATE, - DRM_AMDXDNA_SUBMIT_WAIT, - DRM_AMDXDNA_SUBMIT_SIGNAL, + DRM_AMDXDNA_WAIT_CMD, DRM_AMDXDNA_NUM_IOCTLS }; @@ -96,6 +93,7 @@ struct amdxdna_qos_info { * @mem_size: Size of AIE tile memory. * @umq_doorbell: Returned offset of doorbell associated with UMQ. * @handle: Returned hardware context handle. + * @pad: Structure padding. */ struct amdxdna_drm_create_hwctx { __u64 ext; @@ -108,12 +106,13 @@ struct amdxdna_drm_create_hwctx { __u32 mem_size; __u32 umq_doorbell; __u32 handle; + __u32 pad; }; /** * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. * @handle: Hardware context handle. - * @pad: MBZ. + * @pad: Structure padding. */ struct amdxdna_drm_destroy_hwctx { __u32 handle; @@ -122,9 +121,9 @@ struct amdxdna_drm_destroy_hwctx { /** * struct amdxdna_cu_config - configuration for one CU - * @cu_bo: CU configuration buffer bo handle - * @cu_func: Functional of a CU - * @pad: MBZ + * @cu_bo: CU configuration buffer bo handle. + * @cu_func: Function of a CU. + * @pad: Structure padding. */ struct amdxdna_cu_config { __u32 cu_bo; @@ -135,9 +134,9 @@ struct amdxdna_cu_config { /** * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware * context - * @num_cus: Number of CUs to configure - * @pad: MBZ - * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + * @num_cus: Number of CUs to configure. + * @pad: Structure padding. + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config. */ struct amdxdna_hwctx_param_config_cu { __u16 num_cus; @@ -160,6 +159,7 @@ enum amdxdna_drm_config_hwctx_param { * @param_val: A structure specified by the param_type struct member. * @param_val_size: Size of the parameter buffer pointed to by the param_val. * If param_val is not a pointer, driver can ignore this. + * @pad: Structure padding. * * Note: if the param_val is a pointer pointing to a buffer, the maximum size * of the buffer is 4KiB(PAGE_SIZE). @@ -191,17 +191,16 @@ enum amdxdna_bo_type { /** * struct amdxdna_drm_create_bo - Create a buffer object. * @flags: Buffer flags. MBZ. - * @type: Buffer type. * @vaddr: User VA of buffer if applied. MBZ. * @size: Size in bytes. + * @type: Buffer type. * @handle: Returned DRM buffer object handle. */ struct amdxdna_drm_create_bo { __u64 flags; - __u32 type; - __u32 _pad; __u64 vaddr; __u64 size; + __u32 type; __u32 handle; }; @@ -210,6 +209,7 @@ struct amdxdna_drm_create_bo { * @ext: MBZ. * @ext_flags: MBZ. * @handle: DRM buffer object handle. + * @pad: Structure padding. * @map_offset: Returned DRM fake offset for mmap(). * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). * @xdna_addr: Returned XDNA device virtual address. @@ -218,7 +218,7 @@ struct amdxdna_drm_get_bo_info { __u64 ext; __u64 ext_flags; __u32 handle; - __u32 _pad; + __u32 pad; __u64 map_offset; __u64 vaddr; __u64 xdna_addr; @@ -252,8 +252,8 @@ enum amdxdna_cmd_type { * @ext_flags: MBZ. * @hwctx: Hardware context handle. * @type: One of command type in enum amdxdna_cmd_type. - * @cmd_handles: Array of command handles or the command handle itself in case - * of just one. + * @cmd_handles: Array of command handles or the command handle itself + * in case of just one. * @args: Array of arguments for all command handles. * @cmd_count: Number of command handles in the cmd_handles array. * @arg_count: Number of arguments in the args array. @@ -279,8 +279,6 @@ struct amdxdna_drm_exec_cmd { * @seq: sequence number of the command returned by execute command. * * Wait a command specified by seq to be completed. - * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot - * to submit a new command. */ struct amdxdna_drm_wait_cmd { __u32 hwctx; @@ -290,10 +288,9 @@ struct amdxdna_drm_wait_cmd { /** * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware - * @buffer: The user space buffer that will return the AIE status - * @buffer_size: The size of the user space buffer - * @cols_filled: A bitmap of AIE columns whose data has been returned in the - * buffer. + * @buffer: The user space buffer that will return the AIE status. + * @buffer_size: The size of the user space buffer. + * @cols_filled: A bitmap of AIE columns whose data has been returned in the buffer. */ struct amdxdna_drm_query_aie_status { __u64 buffer; /* out */ @@ -303,8 +300,8 @@ struct amdxdna_drm_query_aie_status { /** * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware - * @major: The major version number - * @minor: The minor version number + * @major: The major version number. + * @minor: The minor version number. */ struct amdxdna_drm_query_aie_version { __u32 major; /* out */ @@ -319,7 +316,7 @@ struct amdxdna_drm_query_aie_version { * @dma_channel_count: The number of dma channels. * @lock_count: The number of locks. * @event_reg_count: The number of events. - * @pad: MBZ. + * @pad: Structure padding. */ struct amdxdna_drm_query_aie_tile_metadata { __u16 row_count; @@ -331,8 +328,7 @@ struct amdxdna_drm_query_aie_tile_metadata { }; /** - * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE - * hardware + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE hardware * @col_size: The size of a column in bytes. * @cols: The total number of columns. * @rows: The total number of rows. @@ -355,7 +351,7 @@ struct amdxdna_drm_query_aie_metadata { * struct amdxdna_drm_query_clock - Metadata for a clock * @name: The clock name. * @freq_mhz: The clock frequency. - * @pad: MBZ. + * @pad: Structure padding. */ struct amdxdna_drm_query_clock { __u8 name[16]; @@ -381,14 +377,12 @@ enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; * @input: The current value of the sensor. * @max: The maximum value possible for the sensor. * @average: The average value of the sensor. - * @highest: The highest recorded sensor value for this driver load for the - * sensor. + * @highest: The highest recorded sensor value for this driver load for the sensor. * @status: The sensor status. * @units: The sensor units. - * @unitm: Translates value member variables into the correct unit via (pow(10, - * unitm) * value) - * @type: The sensor type from enum amdxdna_sensor_type - * @pad: MBZ. + * @unitm: Translates value member variables into the correct unit via (pow(10, unitm) * value). + * @type: The sensor type from enum amdxdna_sensor_type. + * @pad: Structure padding. */ struct amdxdna_drm_query_sensor { __u8 label[64]; @@ -408,14 +402,14 @@ struct amdxdna_drm_query_sensor { * @context_id: The ID for this context. * @start_col: The starting column for the partition assigned to this context. * @num_col: The number of columns in the partition assigned to this context. + * @pad: Structure padding. * @pid: The Process ID of the process that created this context. * @command_submissions: The number of commands submitted to this context. * @command_completions: The number of commands completed by this context. - * @migrations: The number of times this context has been moved to a different - * partition. - * @preemptions: The number of times this context has been preempted by another - * context in the same partition. - * @pad: MBZ. + * @migrations: The number of times this context has been moved to a different partition. + * @preemptions: The number of times this context has been preempted by another context in the + * same partition. + * @errors: The errors for this context. */ struct amdxdna_drm_query_hwctx { __u32 context_id; @@ -471,6 +465,7 @@ enum amdxdna_power_mode_type { POWER_MODE_LOW, /**< Set frequency to lowest DPM */ POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ }; /** @@ -508,13 +503,13 @@ enum amdxdna_drm_get_param { DRM_AMDXDNA_READ_AIE_REG, DRM_AMDXDNA_QUERY_FIRMWARE_VERSION, DRM_AMDXDNA_GET_POWER_MODE, + DRM_AMDXDNA_QUERY_TELEMETRY, DRM_AMDXDNA_NUM_GET_PARAM, }; /** * struct amdxdna_drm_get_info - Get some information from the AIE hardware. - * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed - * in the buffer. + * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer. * @buffer_size: Size of the input buffer. Size needed/written by the kernel. * @buffer: A structure specified by the param struct member. */ @@ -542,10 +537,8 @@ enum amdxdna_drm_set_param { }; /** - * struct amdxdna_drm_set_state - Set the state of some component within the AIE - * hardware. - * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed - * in the buffer. + * struct amdxdna_drm_set_state - Set the state of some component within the AIE hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed in the buffer. * @buffer_size: Size of the input buffer. * @buffer: A structure specified by the param struct member. */ @@ -555,63 +548,35 @@ struct amdxdna_drm_set_state { __u64 buffer; /* in */ }; -/** - * struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync - * objects. - * @handles: Array of handles of sync objects. - * @points: Array of time points for each sync objects. - * @count: Number of elements in the above array. - */ -struct amdxdna_drm_syncobjs { - __u64 handles; /* in */ - __u64 points; /* in */ - __u32 count; /* in */ - __u32 pad; -}; +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, struct amdxdna_drm_create_hwctx) -#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ - struct amdxdna_drm_create_hwctx) +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, struct amdxdna_drm_destroy_hwctx) -#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ - struct amdxdna_drm_destroy_hwctx) +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, struct amdxdna_drm_config_hwctx) -#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ - struct amdxdna_drm_config_hwctx) +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, struct amdxdna_drm_create_bo) -#define DRM_IOCTL_AMDXDNA_CREATE_BO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ - struct amdxdna_drm_create_bo) +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, struct amdxdna_drm_get_bo_info) -#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ - struct amdxdna_drm_get_bo_info) - -#define DRM_IOCTL_AMDXDNA_SYNC_BO \ +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) -#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) -#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) -#define DRM_IOCTL_AMDXDNA_GET_INFO \ +#define DRM_IOCTL_AMDXDNA_GET_INFO \ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) -#define DRM_IOCTL_AMDXDNA_SET_STATE \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ - struct amdxdna_drm_set_state) - -#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \ - struct amdxdna_drm_syncobjs) - -#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \ - struct amdxdna_drm_syncobjs) +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, struct amdxdna_drm_set_state) #if defined(__cplusplus) } /* extern c end */ diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h index fe89931fc5..798022754c 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_agent.h @@ -100,6 +100,9 @@ public: return system_allocator_; } + /// @brief Getter for the AIE system deallocator. + const std::function& system_deallocator() const { return system_deallocator_; } + // AIE agent methods. /// @brief Get the number of columns on this AIE agent. uint32_t GetNumCols() const { return num_cols_; } @@ -124,6 +127,9 @@ private: core::MemoryRegion::AllocateFlags flags)> system_allocator_; + + std::function system_deallocator_; + const hsa_profile_t profile_ = HSA_PROFILE_BASE; const uint32_t min_aql_size_ = 0x40; const uint32_t max_aql_size_ = 0x40; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 79f328ccb8..c0b14db26c 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -49,7 +49,6 @@ #include "core/inc/queue.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" -#include "core/util/locks.h" namespace rocr { namespace AMD { @@ -131,6 +130,12 @@ private: /// @brief Base of the queue's ring buffer storage. void *ring_buf_ = nullptr; + /// @brief Called when the doorbell is rung to iterate over + /// all packets and submit them. Submissions is done by + // calling into the XdnaDriver. + hsa_status_t SubmitCmd(XdnaDriver& driver, void* queue_base, uint64_t read_dispatch_id, + uint64_t write_dispatch_id); + /// @brief Handle for an application context on the AIE device. /// /// Each user queue will have an associated context. This handle is assigned @@ -154,4 +159,4 @@ private: } // namespace AMD } // namespace rocr -#endif // header guard +#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_ diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index ae3197ea95..4c0cad47fb 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -45,9 +45,46 @@ #include #include +#include "core/driver/xdna/uapi/amdxdna_accel.h" +#include "core/inc/amd_aie_agent.h" #include "core/inc/driver.h" #include "core/inc/memory_region.h" +/// @brief struct amdxdna_cmd_chain - Interpretation of data payload for +/// ERT_CMD_CHAIN +struct amdxdna_cmd_chain { + /// Number of commands in chain + __u32 command_count; + /// Index of last successfully submitted command in chain + __u32 submit_index; + /// Index of failing command if cmd status is not completed + __u32 error_index; + __u32 reserved[3]; + /// Address of each command in chain + __u64 data[] __counted_by(command_count); +}; + +/// @brief struct amdxdna_cmd - Exec buffer command header format +struct amdxdna_cmd { + union { + struct { + /// Current state of a command + __u32 state : 4; + __u32 unused : 6; + /// Extra CU masks in addition to mandatory mask + __u32 extra_cu_masks : 2; + /// Number of words in payload (data) + __u32 count : 11; + /// Opcode identifying specific command + __u32 opcode : 5; + __u32 reserved : 4; + }; + __u32 header; + }; + /// Count number of words representing packet payload + __u32 data[] __counted_by(count); +}; + namespace rocr { namespace core { class Queue; @@ -55,6 +92,40 @@ class Queue; namespace AMD { +/// @brief: The number of arguments in the packet payload before we start passing operands +constexpr uint32_t NON_OPERAND_COUNT = 6; + +// @brief: Used to transform an address into a device address +constexpr uint32_t DEV_ADDR_BASE = 0x04000000; +constexpr uint32_t DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; + +/// @brief: The driver places a structure before each command in a command chain. +/// Need to increase the size of the command by the size of this structure. +/// In the following xdna driver source can see where this is implemented: +/// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23 +/// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391 +/// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637 +constexpr uint32_t CMD_COUNT_SIZE_INCREASE = 3; + +/// @brief: The size of an instruction in bytes +constexpr uint32_t INSTR_SIZE_BYTES = 4; + +/// @brief: Index of command payload where the instruction sequence +/// address is located +constexpr uint32_t CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; +constexpr uint32_t CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX = 4; + +/// @brief Environment variable to define job submission timeout +constexpr uint32_t DEFAULT_TIMEOUT_VAL = 50; + +/// @brief: Calculates the number of operands in a packet +/// given the number of arguments in the packet +/// @param: arg_count(Input), Number of arguments in the packet +/// @return: uint32_t, The number of operands in the packet +inline uint32_t GetOperandCount(uint32_t arg_count) { + return ((arg_count - NON_OPERAND_COUNT) / 2); +} + class XdnaDriver final : public core::Driver { public: XdnaDriver(std::string devnode_name); @@ -68,6 +139,9 @@ public: hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; + std::unordered_map& GetHandleMappings(); + std::unordered_map& GetAddrMappings(); + hsa_status_t GetAgentProperties(core::Agent &agent) const override; hsa_status_t GetMemoryProperties(uint32_t node_id, @@ -84,7 +158,11 @@ public: hsa_status_t CreateQueue(core::Queue &queue) const override; hsa_status_t DestroyQueue(core::Queue &queue) const override; -private: + // @brief Submits num_pkts packets in a command chain to the XDNA driver + hsa_status_t SubmitCmdChain(hsa_amd_aie_ert_packet_t* first_pkt, uint32_t num_pkts, + uint32_t num_operands, uint32_t hw_ctx_handle); + + private: hsa_status_t QueryDriverVersion(); /// @brief Allocate device accesible heap space. /// @@ -92,27 +170,55 @@ private: hsa_status_t InitDeviceHeap(); hsa_status_t FreeDeviceHeap(); + /// @brief Creates a command BO and returns a pointer to the memory and + // the corresponding handle + /// + /// @param size size of memory to allocate + /// @param handle A pointer to the BO handle + /// @param cmd A pointer to the buffer + hsa_status_t CreateCmd(uint32_t size, uint32_t* handle, amdxdna_cmd** cmd); + + /// @brief Adds all BOs in a command packet payload to a vector + /// and replaces the handles with a virtual address + /// + /// @param count Number of entries in the command + /// @param bo_args A pointer to a vector that contains all bo handles + /// @param cmd_pkt_payload A pointer to the payload of the command + hsa_status_t RegisterCmdBOs(uint32_t count, std::vector& bo_args, + std::vector& bo_sizes, std::vector& bo_addrs, + hsa_amd_aie_ert_start_kernel_data_t* cmd_pkt_payload, + const std::unordered_map& vmem_addr_mappings); + + /// @brief Syncs all BOs referenced in bo_args + /// + /// @param bo_args vector containing handles of BOs to sync + hsa_status_t SyncBos(const std::vector& bo_args, const std::vector& bo_sizes); + + /// @brief Executes a command and waits for its completion + /// + /// @param exec_cmd Structure containing the details of the command to execute + /// @param hw_ctx_handle the handle of the hardware context to run this + /// command + hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd* exec_cmd, uint32_t hw_ctx_handle); + /// TODO: Remove this in the future and rely on the core Runtime /// object to track handle allocations. Using the VMEM API for mapping XDNA /// driver handles requires a bit more refactoring. So rely on the XDNA driver /// to manage some of this for now. std::unordered_map vmem_handle_mappings; + std::unordered_map vmem_addr_mappings; /// @brief Virtual address range allocated for the device heap. /// /// Allocate a large enough space so we can carve out the device heap in - /// this range and ensure it is aligned to 64MB. Currently, AIE2 supports - /// 48MB device heap and it must be aligned to 64MB. + /// this range and ensure it is aligned to 64MB. Currently, npu1 supports + /// 64MB device heap and it must be aligned to 64MB. void *dev_heap_parent = nullptr; /// @brief The aligned device heap. void *dev_heap_aligned = nullptr; - static constexpr size_t dev_heap_size = 48 * 1024 * 1024; + static constexpr size_t dev_heap_size = 64 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; - - /// @brief DRM buffer object handle for the device heap. Assigned by the - /// kernel-mode driver. - uint32_t dev_heap_handle = 0; }; } // namespace AMD diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp index 7b11dc7cbd..3c823b464f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp @@ -322,6 +322,8 @@ void AieAgent::InitAllocators() { ? mem : nullptr; }; + + system_deallocator_ = [](void* ptr) { core::Runtime::runtime_singleton_->FreeMemory(ptr); }; break; } } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 4018c7b193..fb823a6832 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -41,22 +41,19 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/amd_aie_aql_queue.h" +#include "core/inc/amd_xdna_driver.h" #ifdef __linux__ #include #include #include -#include -#include #endif #ifdef _WIN32 #include #endif -#include -#include -#include +#include #include "core/inc/queue.h" #include "core/inc/runtime.h" @@ -104,7 +101,12 @@ AieAqlQueue::AieAqlQueue(AieAgent *agent, size_t req_size_pkts, drv.CreateQueue(*this); } -AieAqlQueue::~AieAqlQueue() { Inactivate(); } +AieAqlQueue::~AieAqlQueue() { + AieAqlQueue::Inactivate(); + if (ring_buf_) { + agent_.system_deallocator()(ring_buf_); + } +} hsa_status_t AieAqlQueue::Inactivate() { bool active(active_.exchange(false, std::memory_order_relaxed)); @@ -193,8 +195,54 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { } void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { - atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value), - std::memory_order_release); + auto& driver = static_cast(agent_.driver()); + SubmitCmd(driver, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id, + amd_queue_.write_dispatch_id); +} + +hsa_status_t AieAqlQueue::SubmitCmd(XdnaDriver& driver, void* queue_base, uint64_t read_dispatch_id, + uint64_t write_dispatch_id) { + uint64_t cur_id = read_dispatch_id; + while (cur_id < write_dispatch_id) { + hsa_amd_aie_ert_packet_t* pkt = static_cast(queue_base) + cur_id; + + // Get the packet header information + if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC || + pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT) + return HSA_STATUS_ERROR; + + // Get the payload information + switch (pkt->opcode) { + case HSA_AMD_AIE_ERT_START_CU: { + // Iterating over future packets and seeing how many contiguous HSA_AMD_AIE_ERT_START_CU + // packets there are. All can be combined into a single chain. + int num_cont_start_cu_pkts = 1; + int num_operands = 0; + for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) { + hsa_amd_aie_ert_packet_t* peak_pkt = + static_cast(queue_base) + peak_pkt_id; + if (peak_pkt->opcode != HSA_AMD_AIE_ERT_START_CU) { + break; + } + num_operands += GetOperandCount(peak_pkt->count); + num_cont_start_cu_pkts++; + } + + // Call into the driver to submit from cur_id to write_dispatch_id + if (driver.SubmitCmdChain(pkt, num_cont_start_cu_pkts, num_operands, hw_ctx_handle_) != + HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + cur_id += num_cont_start_cu_pkts; + break; + } + default: { + return HSA_STATUS_ERROR; + } + } + } + + return HSA_STATUS_SUCCESS; } void AieAqlQueue::StoreRelease(hsa_signal_value_t value) { @@ -205,16 +253,15 @@ void AieAqlQueue::StoreRelease(hsa_signal_value_t value) { hsa_status_t AieAqlQueue::GetInfo(hsa_queue_info_attribute_t attribute, void *value) { switch (attribute) { - case HSA_AMD_QUEUE_INFO_AGENT: - *(reinterpret_cast(value)) = agent_.public_handle(); - break; - case HSA_AMD_QUEUE_INFO_DOORBELL_ID: - // Hardware doorbell supports AQL semantics. - *(reinterpret_cast(value)) = - reinterpret_cast(signal_.hardware_doorbell_ptr); - break; - default: - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + case HSA_AMD_QUEUE_INFO_AGENT: + *static_cast(value) = agent_.public_handle(); + break; + case HSA_AMD_QUEUE_INFO_DOORBELL_ID: + // Hardware doorbell supports AQL semantics. + *static_cast(value) = reinterpret_cast(signal_.hardware_doorbell_ptr); + break; + default: + return HSA_STATUS_ERROR_INVALID_ARGUMENT; } return HSA_STATUS_SUCCESS; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h index a1479d187e..66c2028a24 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/utils.h @@ -348,6 +348,29 @@ static __forceinline std::string& rtrim(std::string& s) { static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); } +/// @brief: Flush the cachelines associated with the +/// provided address, offset, and length +/// @param: base(Input), base address to flush +/// @param: offset(Input), offset of base address to flush +/// @param: len(Input), length of buffer to flush +inline void FlushCpuCache(const void* base, size_t offset, size_t len) { + static long cacheline_size = 0; + + if (!cacheline_size) { + long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sz <= 0) return; + cacheline_size = sz; + } + + const char* cur = (const char*)base; + cur += offset; + uintptr_t lastline = (uintptr_t)(cur + len - 1) | (cacheline_size - 1); + do { + _mm_clflush((const void*)cur); + cur += cacheline_size; + } while (cur <= (const char*)lastline); +} + } // namespace rocr template @@ -394,6 +417,20 @@ inline uint32_t PtrHigh32(const void* p) { return ptr; } +/// @brief: Concatenates two numbers of type InType to a number of type OutType +/// @param: hi(Input), To be placed in the upper bits of the output +/// @param: lo(Input), To be placed in the lower bits of the output +/// @return: OutType, Concatenation of hi and lo +template +typename std::enable_if::value && std::is_integral::value && + sizeof(OutType) >= 2 * sizeof(InType), + OutType>::type +Concat(InType hi, InType lo) { + OutType res = ((static_cast(hi) << sizeof(InType) * 8) | static_cast(lo)); + return res; +} + + #include "atomic_helpers.h" #endif // HSA_RUNTIME_CORE_UTIL_UTILS_H_