From 240dc71b91a5b9eca840a0dc2e119d616b52e8e8 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Thu, 5 Dec 2024 18:02:00 +0800 Subject: [PATCH 01/32] wsl/hsakmt: move src/inc to include/impl Signed-off-by: Flora Cui Reviewed-by: Horatio Zhang Part-of: --- pm4_cmds.h | 1090 +++++++++++++++++++++++++++++++++++++ registers.h | 363 ++++++++++++ thunk_proxy/thunk_proxy.h | 128 +++++ thunk_proxy/wddm_types.h | 155 ++++++ wddm/cmd_util.h | 83 +++ wddm/device.h | 257 +++++++++ wddm/gpu_memory.h | 227 ++++++++ wddm/queue.h | 363 ++++++++++++ wddm/status.h | 60 ++ wddm/thunks.h | 232 ++++++++ wddm/types.h | 101 ++++ wddm/va_mgr.h | 86 +++ 12 files changed, 3145 insertions(+) create mode 100644 pm4_cmds.h create mode 100644 registers.h create mode 100644 thunk_proxy/thunk_proxy.h create mode 100644 thunk_proxy/wddm_types.h create mode 100644 wddm/cmd_util.h create mode 100644 wddm/device.h create mode 100644 wddm/gpu_memory.h create mode 100644 wddm/queue.h create mode 100644 wddm/status.h create mode 100644 wddm/thunks.h create mode 100644 wddm/types.h create mode 100644 wddm/va_mgr.h diff --git a/pm4_cmds.h b/pm4_cmds.h new file mode 100644 index 0000000000..44b7fb00aa --- /dev/null +++ b/pm4_cmds.h @@ -0,0 +1,1090 @@ +#ifndef _WSL_INC_PM4_CMDS_H_ +#define _WSL_INC_PM4_CMDS_H_ + +#include + +#define mmCOMPUTE_NUM_THREAD_X 0x2E07 +#define mmCOMPUTE_PGM_LO 0x2E0C +#define mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO 0x2E10 +#define mmCOMPUTE_PGM_RSRC1 0x2E12 +#define mmCOMPUTE_PGM_RSRC3 0x2E28 +#define mmCOMPUTE_RESOURCE_LIMITS 0x2E15 +#define mmCOMPUTE_USER_DATA_0 0x2E40 + +#define PM4_TYPE_SHIFT 30 +#define PM4_COUNT_SHIFT 16 +#define PM4_OPCODE_SHIFT 8 +#define PM4_SHADER_TYPE_SHIFT 1 + +#define PM4_GFX_SHADER 0 +#define PM4_COMPUTE_SHADER 1 + +#define PM4_TYPE3_HDR(_opc_, _count_) \ + (uint32_t)((3) << PM4_TYPE_SHIFT | \ + ((_count_) - 2) << PM4_COUNT_SHIFT | \ + (_opc_) << PM4_OPCODE_SHIFT) | \ + (PM4_COMPUTE_SHADER << PM4_SHADER_TYPE_SHIFT) + +union PM4_MEC_TYPE_3_HEADER { + struct 
{ + uint32_t reserved1 : 8; ///< reserved + uint32_t opcode : 8; ///< IT opcode + uint32_t count : 14;///< number of DWORDs - 1 in the information body. + uint32_t type : 2; ///< packet identifier. It should be 3 for type 3 packets + }; + uint32_t u32All; +}; + +#define IT_DISPATCH_DIRECT 0x15 +#define IT_ATOMIC_MEM 0x1E +#define IT_WRITE_DATA 0x37 +#define IT_INDIRECT_BUFFER 0x3F +#define IT_COPY_DATA 0x40 +#define IT_EVENT_WRITE 0x46 +#define IT_RELEASE_MEM 0x49 +#define IT_ACQUIRE_MEM 0x58 +#define IT_SET_SH_REG 0x76 + +struct PM4_MEC_SET_SH_REG { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reg_offset:16; + uint32_t reserved1:16; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_DISPATCH_DIRECT { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t dim_x; + uint32_t dim_y; + uint32_t dim_z; + uint32_t dispatch_initiator; +}; + +// ------------------------------- MEC_EVENT_WRITE_event_index_enum ------------------------------- +enum MEC_EVENT_WRITE_event_index_enum { + event_index__mec_event_write__other = 0, + event_index__mec_event_write__sample_pipelinestat = 2, + event_index__mec_event_write__cs_partial_flush = 4, + event_index__mec_event_write__sample_streamoutstats__GFX11 = 8, + event_index__mec_event_write__sample_streamoutstats1__GFX11 = 9, + event_index__mec_event_write__sample_streamoutstats2__GFX11 = 10, + event_index__mec_event_write__sample_streamoutstats3__GFX11 = 11, +}; + +enum VGT_EVENT_TYPE { + Reserved_0x00 = 0x00000000, + SAMPLE_STREAMOUTSTATS1 = 0x00000001, + SAMPLE_STREAMOUTSTATS2 = 0x00000002, + SAMPLE_STREAMOUTSTATS3 = 0x00000003, + CACHE_FLUSH_TS = 0x00000004, + CONTEXT_DONE = 0x00000005, + CACHE_FLUSH = 0x00000006, + CS_PARTIAL_FLUSH = 0x00000007, + VGT_STREAMOUT_SYNC = 0x00000008, + VGT_STREAMOUT_RESET = 0x0000000a, + END_OF_PIPE_INCR_DE = 0x0000000b, + END_OF_PIPE_IB_END = 0x0000000c, + RST_PIX_CNT = 0x0000000d, + BREAK_BATCH = 0x0000000e, + 
VS_PARTIAL_FLUSH = 0x0000000f, + PS_PARTIAL_FLUSH = 0x00000010, + FLUSH_HS_OUTPUT = 0x00000011, + FLUSH_DFSM = 0x00000012, + RESET_TO_LOWEST_VGT = 0x00000013, + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014, + CACHE_FLUSH_AND_INV_EVENT = 0x00000016, + PERFCOUNTER_START = 0x00000017, + PERFCOUNTER_STOP = 0x00000018, + PIPELINESTAT_START = 0x00000019, + PIPELINESTAT_STOP = 0x0000001a, + PERFCOUNTER_SAMPLE = 0x0000001b, + SAMPLE_PIPELINESTAT = 0x0000001e, + SO_VGTSTREAMOUT_FLUSH = 0x0000001f, + SAMPLE_STREAMOUTSTATS = 0x00000020, + RESET_VTX_CNT = 0x00000021, + BLOCK_CONTEXT_DONE = 0x00000022, + CS_CONTEXT_DONE = 0x00000023, + VGT_FLUSH = 0x00000024, + TGID_ROLLOVER = 0x00000025, + SQ_NON_EVENT = 0x00000026, + SC_SEND_DB_VPZ = 0x00000027, + BOTTOM_OF_PIPE_TS = 0x00000028, + FLUSH_SX_TS = 0x00000029, + DB_CACHE_FLUSH_AND_INV = 0x0000002a, + FLUSH_AND_INV_DB_DATA_TS = 0x0000002b, + FLUSH_AND_INV_DB_META = 0x0000002c, + FLUSH_AND_INV_CB_DATA_TS = 0x0000002d, + FLUSH_AND_INV_CB_META = 0x0000002e, + CS_DONE = 0x0000002f, + PS_DONE = 0x00000030, + FLUSH_AND_INV_CB_PIXEL_DATA = 0x00000031, + SX_CB_RAT_ACK_REQUEST = 0x00000032, + THREAD_TRACE_START = 0x00000033, + THREAD_TRACE_STOP = 0x00000034, + THREAD_TRACE_MARKER = 0x00000035, + THREAD_TRACE_FINISH = 0x00000037, + PIXEL_PIPE_STAT_CONTROL = 0x00000038, + PIXEL_PIPE_STAT_DUMP = 0x00000039, + PIXEL_PIPE_STAT_RESET = 0x0000003a, + CONTEXT_SUSPEND = 0x0000003b, + OFFCHIP_HS_DEALLOC = 0x0000003c, + ENABLE_NGG_PIPELINE = 0x0000003d, + SET_FE_ID__GFX09 = 0x00000009, + Available_0x1c__GFX09 = 0x0000001c, + Available_0x1d__GFX09 = 0x0000001d, + THREAD_TRACE_FLUSH__GFX09 = 0x00000036, + Reserved_0x3f__GFX09 = 0x0000003f, + ZPASS_DONE__GFX09_10 = 0x00000015, + ENABLE_LEGACY_PIPELINE__GFX09_10 = 0x0000003e, + Reserved_0x09__GFX10PLUS = 0x00000009, + FLUSH_ES_OUTPUT__GFX10PLUS = 0x0000001c, + BIN_CONF_OVERRIDE_CHECK__GFX10PLUS = 0x0000001d, + THREAD_TRACE_DRAW__GFX10PLUS = 0x00000036, + DRAW_DONE__GFX10PLUS = 0x0000003f, + WAIT_SYNC__GFX11 
= 0x00000015, + ENABLE_PIPELINE_NOT_USED__GFX11 = 0x0000003e, +}; + +struct PM4_MEC_EVENT_WRITE { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t reserved2:19; + uint32_t offload_enable:1; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_ATOMIC_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t atomic:7; + uint32_t reserved1:1; + uint32_t command:4; + uint32_t reserved2:13; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t src_data_lo; + uint32_t src_data_hi; + uint32_t cmp_data_lo; + uint32_t cmp_data_hi; + union { + struct { + uint32_t loop_interval:13; + uint32_t reserved4:19; + } bitfields9; + uint32_t ordinal9; + }; +}; + +struct PM4_MEC_WRITE_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reserved1:8; + uint32_t dst_sel:4; + uint32_t reserved2:4; + uint32_t addr_incr:1; + uint32_t reserved3:2; + uint32_t resume_vf:1; + uint32_t wr_confirm:1; + uint32_t reserved4:4; + uint32_t cache_policy:2; + uint32_t reserved5:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t dst_mmreg_addr:18; + uint32_t reserved6:14; + } bitfields3a; + struct { + uint32_t dst_gds_addr:16; + uint32_t reserved7:16; + } bitfields3b; + struct { + uint32_t reserved8:2; + uint32_t dst_mem_addr_lo:30; + } bitfields3c; + uint32_t ordinal3; + }; + uint32_t dst_mem_addr_hi; + uint64_t write_data_value; +}; + +#define PERSISTENT_SPACE_START 0x00002c00 + +template +void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, + sizeof(T) / sizeof(uint32_t)); + pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START; +} + +template +void 
GenerateCmdHeader(T* pm4, int op_code) { + pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t)); +} + +/// @brief Defines the Gpu command to dispatch a kernel. It embeds +/// various Gpu hardware specific data structures for initialization +/// and configuration before a dispatch begins to run +struct DispatchTemplate { + + /// @brief Structure used to initialize the group dimensions + /// of a kernel dispatch and if performance counters are enabled + struct DispatchDimensionRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_num_thread_x; + uint32_t compute_num_thread_y; + uint32_t compute_num_thread_z; + } dimension_regs; + + struct DispatchProgramRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_lo; + uint32_t compute_pgm_hi; + } program_regs; + + struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc1; + uint32_t compute_pgm_rsrc2; + } program_resource_regs; + + /// @brief Structure used to initialize parameters related to + /// thread management i.e. 
number of waves to issue and number + /// of Compute Units to use + struct DispatchResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_resource_limits; + uint32_t compute_static_thread_mgmt_se0; + uint32_t compute_static_thread_mgmt_se1; + uint32_t compute_tmpring_size; + uint32_t compute_static_thread_mgmt_se2; + uint32_t compute_static_thread_mgmt_se3; + } resource_regs; + + /// @brief Structure used to pass handles of the Aql dispatch + /// packet, Aql queue, Kernel argument address block, Scratch + /// buffer + struct DispatchComputeUserDataRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_user_data[16]; + } compute_user_data_regs; + + /// @brief Structure used to configure Cache flush policy + /// and dimensions of total work size + PM4_MEC_DISPATCH_DIRECT dispatch_direct; +}; + +struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc3; +}; + + +/// @brief Structure used to issue a scratch-programming command for gfx11+ +struct SetScratchTemplate { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t scratch_lo; + uint32_t scratch_hi; +}; + +/// @brief Structure used to issue a Gpu Barrier command +struct BarrierTemplate { + PM4_MEC_EVENT_WRITE event_write; +}; + +//--------------------MEC_ATOMIC_MEM-------------------- +enum MEC_ATOMIC_MEM_command_enum { + command__mec_atomic_mem__single_pass_atomic = 0, + command__mec_atomic_mem__loop_until_compare_satisfied = 1, + command__mec_atomic_mem__wait_for_write_confirmation = 2, + command__mec_atomic_mem__send_and_continue = 3, +}; + +enum MEC_ATOMIC_MEM_cache_policy_enum { + cache_policy__mec_atomic_mem__lru = 0, + cache_policy__mec_atomic_mem__stream = 1, + cache_policy__mec_atomic_mem__noa = 2, + cache_policy__mec_atomic_mem__bypass = 3, +}; + +enum TC_OP { + TC_OP_READ = 0x00000000, + TC_OP_ATOMIC_FCMPSWAP_RTN_32 = 0x00000001, + TC_OP_ATOMIC_FMIN_RTN_32 = 0x00000002, + TC_OP_ATOMIC_FMAX_RTN_32 = 0x00000003, + TC_OP_RESERVED_FOP_RTN_32_0 = 
0x00000004, + TC_OP_RESERVED_FOP_RTN_32_2 = 0x00000006, + TC_OP_ATOMIC_SWAP_RTN_32 = 0x00000007, + TC_OP_ATOMIC_CMPSWAP_RTN_32 = 0x00000008, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_32 = 0x00000009, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_32 = 0x0000000a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_32 = 0x0000000b, + TC_OP_PROBE_FILTER = 0x0000000c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_2 = 0x0000000e, + TC_OP_ATOMIC_ADD_RTN_32 = 0x0000000f, + TC_OP_ATOMIC_SUB_RTN_32 = 0x00000010, + TC_OP_ATOMIC_SMIN_RTN_32 = 0x00000011, + TC_OP_ATOMIC_UMIN_RTN_32 = 0x00000012, + TC_OP_ATOMIC_SMAX_RTN_32 = 0x00000013, + TC_OP_ATOMIC_UMAX_RTN_32 = 0x00000014, + TC_OP_ATOMIC_AND_RTN_32 = 0x00000015, + TC_OP_ATOMIC_OR_RTN_32 = 0x00000016, + TC_OP_ATOMIC_XOR_RTN_32 = 0x00000017, + TC_OP_ATOMIC_INC_RTN_32 = 0x00000018, + TC_OP_ATOMIC_DEC_RTN_32 = 0x00000019, + TC_OP_WBINVL1_VOL = 0x0000001a, + TC_OP_WBINVL1_SD = 0x0000001b, + TC_OP_RESERVED_NON_FLOAT_RTN_32_0 = 0x0000001c, + TC_OP_RESERVED_NON_FLOAT_RTN_32_1 = 0x0000001d, + TC_OP_RESERVED_NON_FLOAT_RTN_32_2 = 0x0000001e, + TC_OP_RESERVED_NON_FLOAT_RTN_32_3 = 0x0000001f, + TC_OP_WRITE = 0x00000020, + TC_OP_ATOMIC_FCMPSWAP_RTN_64 = 0x00000021, + TC_OP_ATOMIC_FMIN_RTN_64 = 0x00000022, + TC_OP_ATOMIC_FMAX_RTN_64 = 0x00000023, + TC_OP_RESERVED_FOP_RTN_64_0 = 0x00000024, + TC_OP_RESERVED_FOP_RTN_64_1 = 0x00000025, + TC_OP_RESERVED_FOP_RTN_64_2 = 0x00000026, + TC_OP_ATOMIC_SWAP_RTN_64 = 0x00000027, + TC_OP_ATOMIC_CMPSWAP_RTN_64 = 0x00000028, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_64 = 0x00000029, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_64 = 0x0000002a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_64 = 0x0000002b, + TC_OP_WBINVL2_SD = 0x0000002c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_0 = 0x0000002d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_1 = 0x0000002e, + TC_OP_ATOMIC_ADD_RTN_64 = 0x0000002f, + TC_OP_ATOMIC_SUB_RTN_64 = 0x00000030, + TC_OP_ATOMIC_SMIN_RTN_64 = 0x00000031, + TC_OP_ATOMIC_UMIN_RTN_64 = 0x00000032, + TC_OP_ATOMIC_SMAX_RTN_64 = 0x00000033, + 
TC_OP_ATOMIC_UMAX_RTN_64 = 0x00000034, + TC_OP_ATOMIC_AND_RTN_64 = 0x00000035, + TC_OP_ATOMIC_OR_RTN_64 = 0x00000036, + TC_OP_ATOMIC_XOR_RTN_64 = 0x00000037, + TC_OP_ATOMIC_INC_RTN_64 = 0x00000038, + TC_OP_ATOMIC_DEC_RTN_64 = 0x00000039, + TC_OP_WBL2_NC = 0x0000003a, + TC_OP_WBL2_WC = 0x0000003b, + TC_OP_RESERVED_NON_FLOAT_RTN_64_1 = 0x0000003c, + TC_OP_RESERVED_NON_FLOAT_RTN_64_2 = 0x0000003d, + TC_OP_RESERVED_NON_FLOAT_RTN_64_3 = 0x0000003e, + TC_OP_RESERVED_NON_FLOAT_RTN_64_4 = 0x0000003f, + TC_OP_WBINVL1 = 0x00000040, + TC_OP_ATOMIC_FCMPSWAP_32 = 0x00000041, + TC_OP_ATOMIC_FMIN_32 = 0x00000042, + TC_OP_ATOMIC_FMAX_32 = 0x00000043, + TC_OP_RESERVED_FOP_32_0 = 0x00000044, + TC_OP_RESERVED_FOP_32_2 = 0x00000046, + TC_OP_ATOMIC_SWAP_32 = 0x00000047, + TC_OP_ATOMIC_CMPSWAP_32 = 0x00000048, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_32 = 0x00000049, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_32 = 0x0000004a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_32 = 0x0000004b, + TC_OP_INV_METADATA = 0x0000004c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_2 = 0x0000004e, + TC_OP_ATOMIC_ADD_32 = 0x0000004f, + TC_OP_ATOMIC_SUB_32 = 0x00000050, + TC_OP_ATOMIC_SMIN_32 = 0x00000051, + TC_OP_ATOMIC_UMIN_32 = 0x00000052, + TC_OP_ATOMIC_SMAX_32 = 0x00000053, + TC_OP_ATOMIC_UMAX_32 = 0x00000054, + TC_OP_ATOMIC_AND_32 = 0x00000055, + TC_OP_ATOMIC_OR_32 = 0x00000056, + TC_OP_ATOMIC_XOR_32 = 0x00000057, + TC_OP_ATOMIC_INC_32 = 0x00000058, + TC_OP_ATOMIC_DEC_32 = 0x00000059, + TC_OP_INVL2_NC = 0x0000005a, + TC_OP_NOP_RTN0 = 0x0000005b, + TC_OP_RESERVED_NON_FLOAT_32_1 = 0x0000005c, + TC_OP_RESERVED_NON_FLOAT_32_2 = 0x0000005d, + TC_OP_RESERVED_NON_FLOAT_32_3 = 0x0000005e, + TC_OP_RESERVED_NON_FLOAT_32_4 = 0x0000005f, + TC_OP_WBINVL2 = 0x00000060, + TC_OP_ATOMIC_FCMPSWAP_64 = 0x00000061, + TC_OP_ATOMIC_FMIN_64 = 0x00000062, + TC_OP_ATOMIC_FMAX_64 = 0x00000063, + TC_OP_RESERVED_FOP_64_0 = 0x00000064, + TC_OP_RESERVED_FOP_64_1 = 0x00000065, + TC_OP_RESERVED_FOP_64_2 = 0x00000066, + TC_OP_ATOMIC_SWAP_64 = 0x00000067, + 
TC_OP_ATOMIC_CMPSWAP_64 = 0x00000068, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_64 = 0x00000069, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_64 = 0x0000006a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_64 = 0x0000006b, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_0 = 0x0000006c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_1 = 0x0000006d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_2 = 0x0000006e, + TC_OP_ATOMIC_ADD_64 = 0x0000006f, + TC_OP_ATOMIC_SUB_64 = 0x00000070, + TC_OP_ATOMIC_SMIN_64 = 0x00000071, + TC_OP_ATOMIC_UMIN_64 = 0x00000072, + TC_OP_ATOMIC_SMAX_64 = 0x00000073, + TC_OP_ATOMIC_UMAX_64 = 0x00000074, + TC_OP_ATOMIC_AND_64 = 0x00000075, + TC_OP_ATOMIC_OR_64 = 0x00000076, + TC_OP_ATOMIC_XOR_64 = 0x00000077, + TC_OP_ATOMIC_INC_64 = 0x00000078, + TC_OP_ATOMIC_DEC_64 = 0x00000079, + TC_OP_WBINVL2_NC = 0x0000007a, + TC_OP_NOP_ACK = 0x0000007b, + TC_OP_RESERVED_NON_FLOAT_64_1 = 0x0000007c, + TC_OP_RESERVED_NON_FLOAT_64_2 = 0x0000007d, + TC_OP_RESERVED_NON_FLOAT_64_3 = 0x0000007e, + TC_OP_RESERVED_NON_FLOAT_64_4 = 0x0000007f, + TC_OP_RESERVED_FOP_RTN_32_1__GFX09_10 = 0x00000005, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_1__GFX09_10 = 0x0000000d, + TC_OP_RESERVED_FOP_32_1__GFX09_10 = 0x00000045, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_1__GFX09_10 = 0x0000004d, + TC_OP_RESERVED_FADD_RTN_32__GFX11 = 0x00000005, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_RTN_32__GFX11 = 0x0000000d, + TC_OP_RESERVED_FADD_32__GFX11 = 0x00000045, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_32__GFX11 = 0x0000004d, +}; + +// Desc: Structure used to perform various atomic +// operations - add, subtract, increment, etc. +struct AtomicTemplate { + PM4_MEC_ATOMIC_MEM atomic; +}; + +/// @brief PM4 command to write a 64-bit value into a memory +/// location accessible to Gpu +struct WriteDataTemplate { + PM4_MEC_WRITE_DATA write_data; +}; + +// ---------------------------------- MEC_COPY_DATA_src_sel_enum ---------------------------------- +enum MEC_COPY_DATA_src_sel_enum { + src_sel__mec_copy_data__mem_mapped_register = 0, + src_sel__mec_copy_data__tc_l2_obsolete 
= 1, + src_sel__mec_copy_data__tc_l2 = 2, + src_sel__mec_copy_data__gds = 3, + src_sel__mec_copy_data__perfcounters = 4, + src_sel__mec_copy_data__immediate_data = 5, + src_sel__mec_copy_data__atomic_return_data = 6, + src_sel__mec_copy_data__gds_atomic_return_data0 = 7, + src_sel__mec_copy_data__gds_atomic_return_data1 = 8, + src_sel__mec_copy_data__gpu_clock_count = 9, + src_sel__mec_copy_data__system_clock_count = 10, + src_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ---------------------------------- MEC_COPY_DATA_dst_sel_enum ---------------------------------- +enum MEC_COPY_DATA_dst_sel_enum { + dst_sel__mec_copy_data__mem_mapped_register = 0, + dst_sel__mec_copy_data__tc_l2 = 2, + dst_sel__mec_copy_data__gds = 3, + dst_sel__mec_copy_data__perfcounters = 4, + dst_sel__mec_copy_data__tc_l2_obsolete = 5, + dst_sel__mec_copy_data__mem_mapped_reg_dc = 6, + dst_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ------------------------------ MEC_COPY_DATA_src_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_src_cache_policy_enum { + src_cache_policy__mec_copy_data__lru = 0, + src_cache_policy__mec_copy_data__stream = 1, + src_cache_policy__mec_copy_data__noa = 2, + src_cache_policy__mec_copy_data__bypass = 3, +}; + +// --------------------------------- MEC_COPY_DATA_count_sel_enum --------------------------------- +enum MEC_COPY_DATA_count_sel_enum { + count_sel__mec_copy_data__32_bits_of_data = 0, + count_sel__mec_copy_data__64_bits_of_data = 1, +}; + +// --------------------------------- MEC_COPY_DATA_wr_confirm_enum --------------------------------- +enum MEC_COPY_DATA_wr_confirm_enum { + wr_confirm__mec_copy_data__do_not_wait_for_confirmation = 0, + wr_confirm__mec_copy_data__wait_for_confirmation = 1, +}; + +// ------------------------------ MEC_COPY_DATA_dst_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_dst_cache_policy_enum { + dst_cache_policy__mec_copy_data__lru = 0, + dst_cache_policy__mec_copy_data__stream 
= 1, + dst_cache_policy__mec_copy_data__noa = 2, + dst_cache_policy__mec_copy_data__bypass = 3, +}; + +// ------------------------------- MEC_COPY_DATA_pq_exe_status_enum ------------------------------- +enum MEC_COPY_DATA_pq_exe_status_enum { + pq_exe_status__mec_copy_data__default = 0, + pq_exe_status__mec_copy_data__phase_update = 1, +}; + +// ------------------------------- MEC_WRITE_DATA_dst_sel_enum ------------------------------- +enum MEC_WRITE_DATA_dst_sel_enum { + dst_sel__mec_write_data__mem_mapped_register = 0, + dst_sel__mec_write_data__tc_l2 = 2, + dst_sel__mec_write_data__gds = 3, + dst_sel__mec_write_data__memory = 5, + dst_sel__mec_write_data__memory_mapped_adc_persistent_state = 6 }; + +// ------------------------------- MEC_WRITE_DATA_addr_incr_enum ------------------------------- +enum MEC_WRITE_DATA_addr_incr_enum { + addr_incr__mec_write_data__increment_address = 0, + addr_incr__mec_write_data__do_not_increment_address = 1 }; + +// ------------------------------- MEC_WRITE_DATA_wr_confirm_enum ------------------------------- +enum MEC_WRITE_DATA_wr_confirm_enum { + wr_confirm__mec_write_data__do_not_wait_for_write_confirmation = 0, + wr_confirm__mec_write_data__wait_for_write_confirmation = 1 }; + +// ------------------------------- MEC_WRITE_DATA_cache_policy_enum ------------------------------- +enum MEC_WRITE_DATA_cache_policy_enum { + cache_policy__mec_write_data__lru = 0, + cache_policy__mec_write_data__stream = 1, + cache_policy__mec_write_data__noa = 2, + cache_policy__mec_write_data__bypass = 3 }; + +typedef struct PM4_MEC_COPY_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; /// header + uint32_t ordinal1; + }; + union { + struct { + uint32_t src_sel : 4; + uint32_t reserved1 : 4; + uint32_t dst_sel : 4; + uint32_t reserved2 : 1; + uint32_t src_cache_policy : 2; + uint32_t reserved3 : 1; + uint32_t count_sel : 1; + uint32_t reserved4 : 3; + uint32_t wr_confirm : 1; + uint32_t reserved5 : 4; + uint32_t dst_cache_policy : 2; + uint32_t 
reserved6 : 2; + uint32_t pq_exe_status : 1; + uint32_t reserved7 : 2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t src_reg_offset : 18; + uint32_t reserved8 : 14; + } bitfields3a; + struct { + uint32_t reserved9 : 2; + uint32_t src_32b_addr_lo : 30; + } bitfields3b; + struct { + uint32_t reserved10 : 3; + uint32_t src_64b_addr_lo : 29; + } bitfields3c; + struct { + uint32_t src_gds_addr_lo : 16; + uint32_t reserved11 : 16; + } bitfields3d; + uint32_t imm_data; + uint32_t ordinal3; + }; + union { + uint32_t src_memtc_addr_hi; + uint32_t src_imm_data; + uint32_t ordinal4; + }; + union { + struct { + uint32_t dst_reg_offset : 18; + uint32_t reserved12 : 14; + } bitfields5a; + struct { + uint32_t reserved13 : 2; + uint32_t dst_32b_addr_lo : 30; + } bitfields5b; + struct { + uint32_t reserved14 : 3; + uint32_t dst_64b_addr_lo : 29; + } bitfields5c; + struct { + uint32_t dst_gds_addr_lo : 16; + uint32_t reserved15 : 16; + } bitfields5d; + uint32_t ordinal5; + }; + uint32_t dst_addr_hi; +} PM4MEC_COPY_DATA; +namespace gfx9 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t coher_cntl:31; + uint32_t reserved1:1; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t tcl1_vol_action_ena:1; + uint32_t tc_vol_action_ena:1; + uint32_t reserved2:1; + uint32_t tc_wb_action_ena:1; + 
uint32_t tcl1_action_ena:1; + uint32_t tc_action_ena:1; + uint32_t reserved3:1; + uint32_t tc_nc_action_ena:1; + uint32_t tc_wc_action_ena:1; + uint32_t tc_md_action_ena:1; + uint32_t reserved4:3; + uint32_t cache_policy:2; + uint32_t reserved5:2; + uint32_t pq_exe_status:1; + uint32_t reserved6:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:6; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:24; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved3:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved4:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved5:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved6:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union 
{ + struct { + uint32_t poll_interval:16; + uint32_t reserved7:16; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +/// @brief PM4 command to wait for a certain event before proceeding +/// to process another command on the queue +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx9 namespace + +namespace gfx10 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t reserved1; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; + union { + struct { + uint32_t gcr_cntl:19; + uint32_t reserved4:13; + } bitfields8; + uint32_t ordinal8; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:12; + uint32_t reserved2:1; + uint32_t cache_policy:2; + uint32_t reserved3:2; + uint32_t pq_exe_status:1; + uint32_t reserved4:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + 
uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; ///header + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:14; + uint32_t mes_intr_pipe:2; + uint32_t mes_action:1; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved4:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved5:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved6:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved7:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved8:15; + uint32_t optimize_ace_offload_mode:1; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx10 namespace + +namespace gfx11 { + 
+struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:13; + uint32_t cache_policy:2; + uint32_t reserved2:1; + uint32_t pq_exe_status:1; + uint32_t reserved3:1; + uint32_t glk_inv:1; + uint32_t reserved4:1; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved5:16; + uint32_t dst_sel:2; + uint32_t reserved6:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved7:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved8:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved9:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved10; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved11; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved12; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved13; + uint32_t reserved14; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +} // gfx11 namespace + +#endif diff --git a/registers.h b/registers.h new file mode 100644 index 0000000000..4d430b41e4 --- /dev/null +++ b/registers.h @@ -0,0 +1,363 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// This file is used only for open source cmake builds; if we hardcode the +// register values in amd_aql_queue.cpp, then this file won't be required. For +// now we are using this file, where register details are spelled out in the +// structs/unions below. 
+#ifndef _WSL_INC_REGISTERS_H_ +#define _WSL_INC_REGISTERS_H_ + +typedef enum SQ_RSRC_BUF_TYPE { +SQ_RSRC_BUF = 0x00000000, +SQ_RSRC_BUF_RSVD_1 = 0x00000001, +SQ_RSRC_BUF_RSVD_2 = 0x00000002, +SQ_RSRC_BUF_RSVD_3 = 0x00000003, +} SQ_RSRC_BUF_TYPE; + +typedef enum BUF_DATA_FORMAT { +BUF_DATA_FORMAT_INVALID = 0x00000000, +BUF_DATA_FORMAT_8 = 0x00000001, +BUF_DATA_FORMAT_16 = 0x00000002, +BUF_DATA_FORMAT_8_8 = 0x00000003, +BUF_DATA_FORMAT_32 = 0x00000004, +BUF_DATA_FORMAT_16_16 = 0x00000005, +BUF_DATA_FORMAT_10_11_11 = 0x00000006, +BUF_DATA_FORMAT_11_11_10 = 0x00000007, +BUF_DATA_FORMAT_10_10_10_2 = 0x00000008, +BUF_DATA_FORMAT_2_10_10_10 = 0x00000009, +BUF_DATA_FORMAT_8_8_8_8 = 0x0000000a, +BUF_DATA_FORMAT_32_32 = 0x0000000b, +BUF_DATA_FORMAT_16_16_16_16 = 0x0000000c, +BUF_DATA_FORMAT_32_32_32 = 0x0000000d, +BUF_DATA_FORMAT_32_32_32_32 = 0x0000000e, +BUF_DATA_FORMAT_RESERVED_15 = 0x0000000f, +} BUF_DATA_FORMAT; + +/* Two enumerators in this enum share the value 0x6 (the __SI__CI and the __VI spelling). */ typedef enum BUF_NUM_FORMAT { +BUF_NUM_FORMAT_UNORM = 0x00000000, +BUF_NUM_FORMAT_SNORM = 0x00000001, +BUF_NUM_FORMAT_USCALED = 0x00000002, +BUF_NUM_FORMAT_SSCALED = 0x00000003, +BUF_NUM_FORMAT_UINT = 0x00000004, +BUF_NUM_FORMAT_SINT = 0x00000005, +BUF_NUM_FORMAT_SNORM_OGL__SI__CI = 0x00000006, +BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006, +BUF_NUM_FORMAT_FLOAT = 0x00000007, +} BUF_NUM_FORMAT; + +typedef enum BUF_FORMAT { +BUF_FORMAT_32_UINT = 0x00000014, +} BUF_FORMAT; + +typedef enum SQ_SEL_XYZW01 { +SQ_SEL_0 = 0x00000000, +SQ_SEL_1 = 0x00000001, +SQ_SEL_RESERVED_0 = 0x00000002, +SQ_SEL_RESERVED_1 = 0x00000003, +SQ_SEL_X = 0x00000004, +SQ_SEL_Y = 0x00000005, +SQ_SEL_Z = 0x00000006, +SQ_SEL_W = 0x00000007, +} SQ_SEL_XYZW01; + + /* One 32-bit register word viewed as bit-fields, unsigned, signed or float. The bit-fields are declared only when LITTLEENDIAN_CPU or BIGENDIAN_CPU is defined; with neither macro the inner struct declares no members. */ union COMPUTE_TMPRING_SIZE { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 13; + unsigned int : 7; +#elif defined(BIGENDIAN_CPU) + unsigned int : 7; + unsigned int WAVESIZE : 13; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float 
f32All; + }; + + /* GFX11 layout: WAVES 12 bits, WAVESIZE 15 bits, 5 unnamed padding bits. */ union COMPUTE_TMPRING_SIZE_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 15; + unsigned int : 5; +#elif defined(BIGENDIAN_CPU) + unsigned int : 5; + unsigned int WAVESIZE : 15; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + /* GFX12 layout: WAVES 12 bits, WAVESIZE 18 bits, 2 unnamed padding bits. */ union COMPUTE_TMPRING_SIZE_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 18; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int WAVESIZE : 18; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + /* Word 0 is a single 32-bit field, so both endian branches are identical. */ union SQ_BUF_RSRC_WORD0 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD1 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int CACHE_SWIZZLE : 1; + unsigned int SWIZZLE_ENABLE : 1; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 1; + unsigned int CACHE_SWIZZLE : 1; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + /* GFX11 variant of word 1: SWIZZLE_ENABLE is 2 bits wide and there is no separate CACHE_SWIZZLE bit. */ union SQ_BUF_RSRC_WORD1_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int SWIZZLE_ENABLE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 2; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD2 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_RECORDS : 32; +#elif defined(BIGENDIAN_CPU) 
+ unsigned int NUM_RECORDS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + /* Word 3, base layout: field widths add up to 32 bits; the big-endian branch lists the same fields in reverse order. */ union SQ_BUF_RSRC_WORD3 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int NUM_FORMAT : 3; + unsigned int DATA_FORMAT : 4; + unsigned int ELEMENT_SIZE : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int HASH_ENABLE : 1; + unsigned int HEAP : 1; + unsigned int MTYPE__CI__VI : 3; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int MTYPE__CI__VI : 3; + unsigned int HEAP : 1; + unsigned int HASH_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int ELEMENT_SIZE : 2; + unsigned int DATA_FORMAT : 4; + unsigned int NUM_FORMAT : 3; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + /* GFX10 layout of word 3: a single 7-bit FORMAT field takes the place of the separate NUM_FORMAT and DATA_FORMAT fields of the base layout. */ union SQ_BUF_RSRC_WORD3_GFX10 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 7; + unsigned int RESERVED1 : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESOURCE_LEVEL : 1; + unsigned int RESERVED2 : 3; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 3; + unsigned int RESOURCE_LEVEL : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 2; + unsigned int FORMAT : 7; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; 
+ unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + // From V# Table + /* GFX11 layout of word 3: FORMAT is 6 bits, followed by 3 reserved bits; 4 reserved bits sit between ADD_TID_ENABLE and OOB_SELECT. */ union SQ_BUF_RSRC_WORD3_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESERVED2 : 4; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 4; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + // From V# Table + /* GFX12 layout of word 3: same as the GFX11 layout except that the 4 bits between ADD_TID_ENABLE and OOB_SELECT are named WRITE_COMPRESS_ENABLE (1), COMPRESSION_EN (1) and COMPRESSION_ACCESS_MODE (2). */ union SQ_BUF_RSRC_WORD3_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int COMPRESSION_EN : 1; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int COMPRESSION_EN : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, 
bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; +#endif // header guard diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h new file mode 100644 index 0000000000..fcdb5ea4de --- /dev/null +++ b/thunk_proxy/thunk_proxy.h @@ -0,0 +1,128 @@ +#ifndef _WSL_INC_THUNK_PROXY_H_ +#define _WSL_INC_THUNK_PROXY_H_ + +#include + +namespace thunk_proxy { +/* kDomainCount is the last enumerator, so its value (4) equals the number of domains listed before it. */ enum AllocDomain { + kSystem, + kLocal, + kUserMemory, + kUserQueue, + kDomainCount, +}; + +/* Single-bit values; GpuMemory::IsFineGrain tests kFineGrain with a bitwise AND against mem_flags. */ enum MemFlag { + kFineGrain = (1ULL << 0), + kKernarg = (1ULL << 1), +}; + +/* Single-bit values, one per engine. */ enum EngineFlag { + KCOMPUTE0 = (1ULL << 0), + KDRMDMA = (1ULL << 1), + KDRMDMA1 = (1ULL << 2), +}; + +enum SchedLevel { + kLow = 0, + kNormal = 1, + kHigh = 2, +}; + +enum AsicFamilyType { + kPlumBONITO, + kNavi44, + kNavi48 +}; + +/* osHwsEnableFlags overlays the hwsMask bit-field: one enable bit each for gfx, compute, dma and dma1, then 28 reserved bits. */ struct HwsInfo { + union { + struct { + uint32_t gfxHwsEnabled : 1; + uint32_t computeHwsEnabled : 1; + uint32_t dmaHwsEnabled : 1; + uint32_t dma1HwsEnabled : 1; + uint32_t reserved : 28; + } hwsMask; + uint32_t osHwsEnableFlags; + }; + uint64_t engineOrdinalMask; // Indicates which engines (by ordinal) support MES HWS +}; + +/* Per-adapter properties record (named DeviceInfo at the closing brace); presumably filled in by ParseAdapterInfo - confirm in the implementation. */ typedef struct { + int major; + int minor; + int stepping; + bool is_dgpu; + char product_name[MAX_PATH]; + const char *uuid; + AsicFamilyType family; + uint32_t device_id; + uint32_t wavefront_size; + uint32_t compute_unit_count; + uint32_t max_engine_clock_mhz; + uint32_t watch_points_num; + uint32_t pci_bus_addr; + uint32_t memory_bus_width; + uint32_t max_memory_clock_mhz; + uint64_t gpu_counter_frequency; + uint32_t wave_per_cu; + uint32_t simd_per_cu; + uint32_t max_scratch_slots_per_cu; + uint32_t num_shader_engine; + uint32_t shader_array_per_shader_engine; + uint32_t domain; + uint32_t num_gws; + uint32_t asic_revision; + uint64_t local_visible_heap_size; + uint64_t local_invisible_heap_size; + uint64_t private_aperture_base; + uint64_t private_aperture_size; + uint64_t shared_aperture_base; + uint64_t shared_aperture_size; + uint32_t user_queue_size; + uint32_t 
lds_size; + uint32_t big_page_alignment_size; + uint32_t hw_big_page_min_alignment_size; + uint32_t hw_big_page_alignment_size; + bool enable_big_page_alignment; + uint32_t mec_fw_version; + uint32_t sdma_fw_version; + uint32_t l1_cache_size; + uint32_t l2_cache_size; + uint32_t l3_cache_size; + uint32_t gl2_cacheline_size; + uint32_t num_cp_queues; + HwsInfo hwsInfo; + /* NOTE(review): the element type of this std::vector is missing from the text as received (as are the header names on the bare include lines); WDDMDevice::GetSdmaEngine returns an element as uint32_t - confirm against the upstream header. */ std::vector sdma_schedid; + uint32_t compute_schedid; + bool state_shadowing_by_cpfw; + bool platform_atomic_support; + void *adapter_info; + void *adapter_ex_info; +} DeviceInfo; + +int EngineOrdinal(int engine, DeviceInfo *device_info); +bool GetHwsEnabled(int engine, DeviceInfo *device_info); +bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info); +bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info); +bool QueryAdapterSupported(D3DKMT_HANDLE adapter); + +uint32_t QueueEngine2EngineFlag(uint32_t queue_engine); +void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain, + uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info); +bool CreatePrivateAllocInfo(int num_handles, void **ppdrv_priv, void **ppalloc_priv, + int *pdrv_priv_data_size, int *palloc_priv_data_size); +void DestroyPrivateAllocInfo(void *drv_priv, void *alloc_priv); + +/* The Create*PrivData functions below hand their result back through the priv_data out-parameter; presumably the int return is the size of that data and DestroyPrivData releases it - confirm in the implementation. */ int CreateSubmitPrivData(void **priv_data, D3DKMT_HANDLE queue, uint64_t command_addr, + uint64_t command_size, bool is_hw_queue); +int CreateHwQueuePrivData(void **priv_data, D3DKMT_HANDLE context, + bool FwManagedGfxState, SchedLevel level = kNormal); +int CreateContextPrivData(void **priv_data, bool FwManagedGfxState); +int CreatePowerOptPrivData(void **priv_data, bool restore); +int CreateCalibratedTimestampsPrivData(void **priv_data); +void QueryCalibratedTimestamps(void* priv, uint64_t* gpu, uint64_t* cpu); +void DestroyPrivData(void *priv_data); +} +#endif diff --git a/thunk_proxy/wddm_types.h b/thunk_proxy/wddm_types.h new file mode 100644 index 0000000000..83422a83d3 --- /dev/null +++ 
b/thunk_proxy/wddm_types.h @@ -0,0 +1,155 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_ +#define _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_ + +#include + +#include + +/* Fixed-width definitions for Windows-style type names. In this header LONG and ULONG are 32 bits wide, and UINT_PTR, ULONG_PTR and ULONG64_PTR are declared as pointer types (pointer to the matching integer), not as integers. */ typedef uint32_t UINT, *UINT_PTR; +typedef int32_t INT32; +typedef int32_t LONG; +typedef uint32_t ULONG, *ULONG_PTR; +typedef int64_t LONGLONG; +typedef int64_t LONG64; +typedef uint64_t ULONGLONG; +typedef uint64_t ULONG64, *ULONG64_PTR; +typedef uint8_t BYTE; +typedef uint16_t WORD; +typedef uint32_t DWORD; +typedef int32_t BOOL; +typedef int32_t NTSTATUS; +typedef uint16_t USHORT; +typedef uint16_t UINT16; +typedef uint32_t UINT32; +typedef uint64_t UINT64; +typedef int32_t INT; +typedef uint64_t SIZE_T; +typedef void VOID; +typedef float FLOAT; +typedef char CHAR; +typedef unsigned char UCHAR; +typedef UCHAR BOOLEAN; +typedef int16_t WCHAR; +typedef void *HANDLE; +typedef void *PVOID; +typedef void *LPVOID; +typedef const int16_t *PCWSTR; + +#define ULONG ULONG +#define ULONG_PTR ULONG_PTR +#define USHORT USHORT + +#define DECLARE_HANDLE(name) struct name##__{int unused;}; typedef struct name##__ *name +#define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1] + +/* DECLARE_HANDLE(X) declares struct X__ and makes X a pointer to it, so each handle name is a distinct pointer type. C_ASSERT(e) is a compile-time check: the typedef has array size -1, which is ill-formed, when e is false. */ DECLARE_HANDLE(HWND); +DECLARE_HANDLE(HDC); +DECLARE_HANDLE(PALETTEENTRY); + +typedef struct tagPOINT { + LONG x; + LONG y; +} POINT; + +typedef struct tagRECT { + LONG left; + LONG top; + LONG right; + LONG bottom; +} RECT; + +typedef struct tagRECTL { + LONG left; + LONG top; + LONG right; + LONG bottom; +} RECTL; + +/* 64-bit value that can also be accessed as two DWORD halves through u; which half is the low one in memory depends on byte order because LowPart is declared first. */ typedef union _LARGE_INTEGER { + struct { + DWORD LowPart; + DWORD HighPart; + } u; + LONGLONG QuadPart; +} LARGE_INTEGER; + +typedef LARGE_INTEGER *PLARGE_INTEGER; + +typedef struct _LUID { + ULONG LowPart; + LONG HighPart; +} LUID, *PLUID; + +typedef enum _DEVICE_POWER_STATE { + PowerDeviceUnspecified = 0, + PowerDeviceD0, + PowerDeviceD1, + PowerDeviceD2, + PowerDeviceD3, + PowerDeviceMaximum +} DEVICE_POWER_STATE, *PDEVICE_POWER_STATE; + +#define _Check_return_ +#define APIENTRY 
+#define CONST const +#define IN +#define OUT +#define FAR +#define MAX_PATH 260 +#define __stdcall + +#ifndef GUID_DEFINED +#define GUID_DEFINED +typedef struct _GUID { + uint32_t Data1; + uint16_t Data2; + uint16_t Data3; + uint8_t Data4[ 8 ]; +} GUID; +#endif + +#include + +#endif diff --git a/wddm/cmd_util.h b/wddm/cmd_util.h new file mode 100644 index 0000000000..f69d45242a --- /dev/null +++ b/wddm/cmd_util.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ + +#ifndef _WSL_INC_WDDM_CMD_UTIL_H_ +#define _WSL_INC_WDDM_CMD_UTIL_H_ + +#include +#include "hsa-runtime/inc/hsa.h" +#include "hsa-runtime/inc/amd_hsa_queue.h" +#include "hsa-runtime/inc/amd_hsa_kernel_code.h" +#include "impl/pm4_cmds.h" +#include "util/utils.h" +#include "libhsakmt.h" + +namespace wsl { +namespace thunk { + +/* Input record for CmdUtil::BuildDispatch, which receives it as struct DispatchInfo *pInfo. No member has a default initializer, so the caller must set every field it relies on. */ struct DispatchInfo { + uint8_t major; + hsa_kernel_dispatch_packet_t *pPacket; + void *pEntry; + const amd_kernel_code_t *pKernelObject; + uint32_t ldsBlks; + amd_queue_t *pAmdQueue; + bool wave32; + uint32_t srd; + void *pScratchBase; + uint32_t scratchSizePerWave; + uint32_t scratchBaseOffset[2]; + uint32_t offsetCnt; +}; + +/* Collection of Build* helpers with no data members. Each takes a caller-supplied pBuffer and returns size_t; presumably the command is written into pBuffer and the return value is its size - confirm the unit (bytes or dwords) in the implementation. */ class CmdUtil { +public: + CmdUtil() {}; + ~CmdUtil() {}; + + size_t BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel = dst_sel__mec_copy_data__tc_l2, + uint32_t dstCachePolicy = dst_cache_policy__mec_copy_data__stream, + uint32_t srcSel = src_sel__mec_copy_data__gpu_clock_count, + uint32_t srcCachePolicy = src_cache_policy__mec_copy_data__lru, + uint32_t countSel = count_sel__mec_copy_data__64_bits_of_data, + uint32_t wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation); + + size_t BuildBarrier( + void *pBuffer, + uint32_t eventIndex = event_index__mec_event_write__cs_partial_flush, + uint32_t eventType = CS_PARTIAL_FLUSH); + + size_t BuildWriteData64Command( + void *pBuffer, + uint64_t* write_addr, + uint64_t write_value); + + size_t BuildAcquireMem( + uint8_t major, + void 
*pBuffer); + + size_t BuildScratch( + void *pScratchBase, + void *pBuffer); + + size_t BuildComputeShaderParams( + void *pBuffer); + + size_t BuildDispatch( + struct DispatchInfo *pInfo, + void *pBuffer); + + /* Parameter order differs from the other Build* methods: pBuffer is the third parameter here (first or second elsewhere) and is followed by two defaulted arguments, cachePolicy and srcData (default 1). */ size_t BuildAtomicMem( + uint64_t *pAddr, + uint32_t atomic, + void *pBuffer, + uint32_t cachePolicy = cache_policy__mec_atomic_mem__stream, + uint64_t srcData = 1); +}; + +} // namespace thunk +} // namespace wsl + +#endif \ No newline at end of file diff --git a/wddm/device.h b/wddm/device.h new file mode 100644 index 0000000000..2aa751b662 --- /dev/null +++ b/wddm/device.h @@ -0,0 +1,257 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_DEVICE_H_ +#define _WSL_INC_WDDM_DEVICE_H_ + +#include +#include + +#include +#include +#include + +#include "impl/wddm/types.h" +#include "impl/thunk_proxy/thunk_proxy.h" +#include "impl/wddm/va_mgr.h" +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" +#include "impl/wddm/gpu_memory.h" +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +//class Queue; +class WDDMQueue; + +// WSL2 hyperv GPADL protocol limitation +#define MAX_USERPTR_BLOCK_SIZE 0xf0000000 +#define START_NON_CANONICAL_ADDR (1ULL << 47) +#define END_NON_CANONICAL_ADDR (~0ULL - (1ULL << 47)) /* ULL literals keep this 64-bit on every ABI, matching START_NON_CANONICAL_ADDR */ +#define IS_OVERLAPPING(start1, size1, start2, size2) \ + (((start1) < ((start2) + (size2))) && ((start2) < ((start1) + (size1)))) /* true when the half-open ranges [start, start + size) intersect; each argument is parenthesized so expressions can be passed, but note every argument is evaluated twice */ + +/* Wrapper around one adapter, constructed from its D3DKMT handle and LUID; the accessors that follow return fields of device_info_. */ class WDDMDevice { +public: + static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30); // 2 GB + + WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid); + ~WDDMDevice(); + + int Major() { return device_info_.major; } + int Minor() { return device_info_.minor; } + int Stepping() { return device_info_.stepping; } + bool IsDgpu() { return device_info_.is_dgpu; } + const char *ProductName() { return device_info_.product_name; } + const char *Uuid() { 
return device_info_.uuid; } + thunk_proxy::AsicFamilyType GfxFamily() { return device_info_.family; } + uint32_t DeviceId() { return device_info_.device_id; } + uint32_t WavefrontSize() { return device_info_.wavefront_size; } + uint32_t ComputeUnitCount() { return device_info_.compute_unit_count; } + uint32_t MaxEngineClockMhz() { return device_info_.max_engine_clock_mhz; } + uint32_t WatchPointsNum() { return device_info_.watch_points_num; } + uint32_t PciBusAddr() { return device_info_.pci_bus_addr; } + + uint32_t MemoryBusWidth() { return device_info_.memory_bus_width; } + uint32_t MaxMemoryClockMhz() { return device_info_.max_memory_clock_mhz; } + uint32_t WavePerCu() { return device_info_.wave_per_cu; } + uint32_t SimdPerCu() { return device_info_.simd_per_cu; } + uint32_t MaxScratchSlotsPerCu() { return device_info_.max_scratch_slots_per_cu; } + uint32_t NumShaderEngine() { return device_info_.num_shader_engine; } + uint32_t ShaderArrayPerShaderEngine() { return device_info_.shader_array_per_shader_engine; } + /* Number of entries in sdma_schedid; the container size() result is implicitly narrowed to uint32_t. */ uint32_t NumSdmaEngine() { return device_info_.sdma_schedid.size(); } + uint32_t Domain() { return device_info_.domain; } + uint32_t NumGws() { return device_info_.num_gws; } + uint32_t AsicRevision() { return device_info_.asic_revision; } + /* Sum of the visible and invisible local heap sizes. */ uint64_t LocalHeapSize() { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; } + uint64_t LocalVisibleHeapSize() { return device_info_.local_visible_heap_size; } + uint64_t LocalInvisibleHeapSize() { return device_info_.local_invisible_heap_size; } + uint64_t PrivateApertureBase() { return device_info_.private_aperture_base; } + uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } + uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } + uint64_t SharedApertureSize() { return device_info_.shared_aperture_size; } + uint32_t LdsSize() { return device_info_.lds_size; } + uint64_t GPUCounterFrequency() { return 
device_info_.gpu_counter_frequency; } + uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; } + uint32_t GetMecFwVersion() { return device_info_.mec_fw_version; } + uint32_t GetSdmaFwVersion() { return device_info_.sdma_fw_version; } + uint32_t GetL1CacheSize() { return device_info_.l1_cache_size; } + uint32_t GetL2CacheSize() { return device_info_.l2_cache_size; } + uint32_t GetL3CacheSize() { return device_info_.l3_cache_size; } + uint32_t Gl2CacheLineSize() { return device_info_.gl2_cacheline_size; } + bool SupportStateShadowingByCpFw(void) const { return device_info_.state_shadowing_by_cpfw; } + bool SupportPlatformAtomic(void) const { return device_info_.platform_atomic_support; } + /* Returns sdma_schedid[idx]. The index is validated only by assert(), so there is no range check when assertions are compiled out. */ uint32_t GetSdmaEngine(uint32_t idx) { + assert(idx < NumSdmaEngine()); + return device_info_.sdma_schedid[idx]; + } + uint32_t GetComputeEngine() { return device_info_.compute_schedid; } + + uint64_t VramAvail(); + + void GetClockCounters(uint64_t *gpu, uint64_t *cpu); + uint32_t GetNumCpQueues() { return device_info_.num_cp_queues; } + + bool CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr); + void DestroySyncobj(D3DKMT_HANDLE handle); + + bool CreateQueue(WDDMQueue *queue); + void DestroyQueue(WDDMQueue *queue); + bool CreateHwQueue(WDDMQueue *queue); + bool DestroyHwQueue(WDDMQueue *queue); + bool SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + bool SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + + /* Reads page_fence_value_ once; if the value stored at page_fence_addr_ is still below it, calls GpuWait on page_syncobj_ for that value. Returns false only when that GpuWait call returns false. */ bool WaitPagingFence(WDDMQueue *queue) { + uint64_t value = page_fence_value_; + + if (*page_fence_addr_ < value && + !GpuWait(queue, &page_syncobj_, &value, 1)) + return false; + + return true; + } + + bool GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs, + uint64_t *values, int count); + bool GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs, + uint64_t *value, int count); + bool CpuWait(const D3DKMT_HANDLE 
*syncobjs, uint64_t *value, + int count, bool wait_any); + bool WaitOnPagingFenceFromCpu(); + + uint32_t LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt); + uint32_t GetCmdbufSize(void) const { return cmdbuf_size_; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size_; } + static uint32_t GetAqlFrameNum(void) { return cmdbuf_aql_frame_num_; } + + // Both legacy HWS and stage 1 HWS use the KMD to allocate user queue memory, + // so this returns false by default + bool AllocUserQueueMemFromUMD(void) const { return false; } + + /* Forwards to thunk_proxy::GetHwsEnabled with this device's device_info_. */ bool IsHwsEnabled(int engine) { + return thunk_proxy::GetHwsEnabled(engine, &device_info_); + } + + void UpdatePageFence(uint64_t fence_value); + + D3DKMT_HANDLE PagingQueue() const { return page_queue_; } + D3DKMT_HANDLE PagingFence() const { return page_syncobj_; } + D3DKMT_HANDLE DeviceHandle() const { return device_; } + LUID GetLuid() const { return adapter_luid_; } + + const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } + + /* NOTE(review): hit_base_addr looks like a misspelling of hint_base_addr - confirm with the implementation before renaming. */ ErrorCode ReserveGpuVirtualAddress(thunk_proxy::AllocDomain domain, + gpusize hit_base_addr, + gpusize size, + gpusize *out_gpu_virtual_addr, + gpusize alignment, + bool lock=false); + + ErrorCode FreeGpuVirtualAddress(thunk_proxy::AllocDomain domain, + gpusize base_addr, + gpusize size); + + ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem); + ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr); + void HandleApertureFree(gpusize gpu_addr); + +private: + bool ParseDeviceInfo(void); + void DestroyDeviceInfo(void); + bool CreateDevice(void); + bool DestroyDevice(void); + bool CreatePagingQueue(void); + bool DestroyPagingQueue(void); + void *Lock(D3DKMT_HANDLE handle); + bool Unlock(D3DKMT_HANDLE handle); + bool CreateContext(int engine, D3DKMT_HANDLE *handle); + bool DestroyContext(D3DKMT_HANDLE handle); + + void SetPowerOptimization(bool restore); + void InitCmdbufInfo(void); + bool ReserveSystemHeapSpace(void); + bool 
FreeSystemHeapSpace(void); + bool ReserveLocalHeapSpace(void); + bool InitHandleApertureSpace(void); + bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); + bool DecommitSystemHeapSpace(void* addr, int64_t size); + bool FreeLocalHeapSpace(void); + void InitVaMgr(); + void InitHandleApertureMgr(); + + D3DKMT_HANDLE adapter_; + LUID adapter_luid_; + D3DKMT_HANDLE device_; + + D3DKMT_HANDLE page_queue_; + D3DKMT_HANDLE page_syncobj_; + uint64_t *page_fence_addr_; + /* NOTE(review): the template arguments of this std::atomic and of the two std::unique_ptr members below are missing from the text as received; WaitPagingFence copies this member into a uint64_t - confirm against the upstream header. */ std::atomic page_fence_value_; + + uint64_t handle_aperture_start_; + uint64_t handle_aperture_size_; + uint64_t local_heap_space_start_; + uint64_t local_heap_space_size_; + uint64_t system_heap_space_start_; + uint64_t system_heap_space_size_; + uint32_t cmdbuf_size_; + uint32_t cmdbuf_aql_frame_size_; + /* Declared without an initializer, so the definition and value must live in a source file. */ static const uint32_t cmdbuf_aql_frame_num_; + // device info + thunk_proxy::DeviceInfo device_info_; + + std::unique_ptr local_va_mgr_; + std::unique_ptr handle_aperture_mgr_; + //CmdUtil cmd_util; +}; + +/* Returns the adapter list and its length through the two reference parameters; ownership of the returned array is not visible here - confirm in the implementation. */ NTSTATUS WDDMGetAdapters(D3DKMT_ADAPTERINFO *&adapters, int &num_adapters); + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/wddm/gpu_memory.h b/wddm/gpu_memory.h new file mode 100644 index 0000000000..4f5fe7e59c --- /dev/null +++ b/wddm/gpu_memory.h @@ -0,0 +1,227 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_GPU_MEMORY_H_ +#define _WSL_INC_WDDM_GPU_MEMORY_H_ + +#include +#include +#include "util/utils.h" +#include "impl/wddm/types.h" +#include "impl/wddm/thunks.h" +#include "impl/thunk_proxy/thunk_proxy.h" + +namespace wsl { +namespace thunk { + +class WDDMDevice; + +/* Creation options as individual bits; reserved aliases the whole 64-bit word so all flags can be cleared at once. */ union GpuMemoryCreateFlags { + struct { + uint64_t virtual_alloc : 1; // only allocate virtual address, without physical buffer + uint64_t physical_only : 1; // only allocate physical buffer, without virtual address + uint64_t interprocess : 1; // physical buffer needs to share info between exporter and importer + uint64_t locked : 1; // lock virtual address space into RAM, preventing that memory from being paged to the swap area + uint64_t physical_contiguous : 1; // contiguous physical pages + uint64_t unused : 59; + }; + uint64_t reserved; +}; + +/* Request passed to GpuMemory::Init and WDDMDevice::CreateGpuMemory. The constructor sets every member: flags cleared, domain kLocal, sizes and hints zero, no user pointer, dmabuf_fd -1. */ struct GpuMemoryCreateInfo { + GpuMemoryCreateInfo() { + flags.reserved = 0; + domain = thunk_proxy::kLocal; + size = 0; + alignment = 0; + mem_flags = 0; + engine_flag = 0; + va_hint = 0; + user_ptr = nullptr; + dmabuf_fd = -1; + } + + GpuMemoryCreateFlags flags; + thunk_proxy::AllocDomain domain; + gpusize size; + gpusize alignment; + int mem_flags; + int engine_flag; + int dmabuf_fd; // Import from dmabuf + + void *user_ptr; + gpusize va_hint; +}; + +/* Description of an existing allocation. The constructor zeroes addresses, sizes and flags; domain and adapter_luid get default member initializers (kLocal, as in GpuMemoryCreateInfo, and a zero LUID) so that accessors such as IsLocal and IsSameAdapter never read indeterminate values. */ struct GpuMemoryDesc { + GpuMemoryDesc() { + gpu_addr = 0; + cpu_addr = nullptr; + client_size = 0; + size = alignment = 0; + flags.reserved = 0; + mem_flags = 0; + engine_flag = 0; + handle_ape_addr = 0; + } + + thunk_proxy::AllocDomain domain = thunk_proxy::kLocal; + LUID adapter_luid = {}; // Where is the backing store location + gpusize gpu_addr; + void *cpu_addr; + gpusize client_size; // user request size + gpusize size; + gpusize alignment; + gpusize handle_ape_addr; + + union { + struct { + uint32_t is_virtual : 1; + uint32_t is_shared : 1; + uint32_t is_external : 1; + uint32_t is_physical_only : 1; + uint32_t is_locked : 1; 
+ uint32_t is_queue_referenced : 1; + uint32_t is_physical_contiguous : 1; + uint32_t unused : 25; + }; + + uint32_t reserved; + } flags; + + int mem_flags; + int engine_flag; +}; + +struct SharedHandleInfo { + thunk_proxy::AllocDomain domain; + LUID adapter_luid; + gpusize client_size; // user request size + uint64_t size; + uint32_t flags; + int mem_flags; +}; + +using GpuMemoryHandle = void *; + +class GpuMemory { +public: + static size_t CalcChunkNumbers(gpusize size); + + ErrorCode Init(const GpuMemoryCreateInfo &create_info); + + WDDMDevice *GetDevice() const { return device_; } + gpusize Size() const { return desc_.size; } + gpusize ClientSize() const { return desc_.client_size; } + uint64_t GpuAddress() const { return desc_.gpu_addr; } + void *CpuAddress() const { return desc_.cpu_addr; } + uint64_t HandleApeAddress() const { return desc_.handle_ape_addr; } + + inline bool IsLocal() const { return desc_.domain == thunk_proxy::kLocal; } + inline bool IsUserMemory() const { return desc_.domain == thunk_proxy::kUserMemory; } + inline bool IsSystem() const { return desc_.domain == thunk_proxy::kSystem; } + inline bool IsUserQueue() const { return desc_.domain == thunk_proxy::kUserQueue; } + inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; } + inline bool IsPhysicalContiguous() const { return desc_.flags.is_physical_contiguous; } + inline bool IsVirtual() const { return desc_.flags.is_virtual; } + inline bool IsShared() const { return desc_.flags.is_shared; } + inline bool IsExternal() const { return desc_.flags.is_external; } + + inline uint32_t Flags() const { return desc_.flags.reserved; } + inline int GetAllocInfo() const { return desc_.mem_flags; } + inline bool IsFineGrain() const { return (desc_.mem_flags & thunk_proxy::kFineGrain); } + inline bool IsSameAdapter(const LUID &luid) const { + return (desc_.adapter_luid.HighPart == luid.HighPart && + desc_.adapter_luid.LowPart == luid.LowPart); + } + inline void GetQueueReference() { 
desc_.flags.is_queue_referenced = 1; } + inline void PutQueueReference() { desc_.flags.is_queue_referenced = 0; } + inline bool IsQueueReferenced() const { return desc_.flags.is_queue_referenced; } + + WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; } + size_t NumChunks() const { return num_allocations_; } + + const GpuMemoryHandle GetGpuMemoryHandle() const { + return reinterpret_cast(const_cast(this)); + } + + static GpuMemory *Convert(GpuMemoryHandle handle) { return reinterpret_cast(handle); } + + ErrorCode ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize va_size, gpusize alignment); + ErrorCode FreeGpuVirtualAddress(gpusize va_start_address, gpusize va_size); + + ErrorCode MapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + ErrorCode UnmapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + + ErrorCode MakeResident(); + ErrorCode Evict(); + + ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS); + ErrorCode ImportPhysicalHandle(int dmabuf_fd); + ~GpuMemory(); +protected: + explicit GpuMemory(WDDMDevice *device); +private: + ErrorCode CreatePhysicalMemory(); + ErrorCode FreePhysicalMemory(); + + uint64_t AdjustSize(gpusize size) const; +private: + friend class WDDMDevice; + + WDDMDevice *const device_; + + GpuMemoryDesc desc_; + + size_t num_allocations_; + WinAllocationHandle *alloc_handles_ptr_; + WinAllocationHandle alloc_handle_; // Optimization for num_allocations_ is 1 + + WinResourceHandle resource_; // Handle to a resource object that wraps the allocation. 
Used for shared resources + + DISALLOW_COPY_AND_ASSIGN(GpuMemory); +}; + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/wddm/queue.h b/wddm/queue.h new file mode 100644 index 0000000000..8dfd1c9e4a --- /dev/null +++ b/wddm/queue.h @@ -0,0 +1,363 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// +#ifndef _WSL_INC_WDDM_QUEUE_H_ +#define _WSL_INC_WDDM_QUEUE_H_ + +#include +#include +#include +#include +#include +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "impl/wddm/gpu_memory.h" +#include "hsa-runtime/inc/hsa_ext_amd.h" +#include "hsa-runtime/inc/amd_hsa_queue.h" +#include "hsa-runtime/inc/amd_hsa_signal.h" +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +class Queue; +class WDDMDevice; + +class WDDMQueue { +public: + WDDMQueue(WDDMDevice *device, + uint64_t cmdbuf_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true) : + device(device), + context(0), + queue(0), + syncobj(0), + sync_addr(NULL), + cmdbuf(0), + cmdbuf_addr(cmdbuf_addr), + cmdbuf_size(cmdbuf_size), + queue_engine(engine), + use_hws(use_hws), + prio(thunk_proxy::kNormal) { + + } + + virtual ~WDDMQueue() { } + + virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; } + virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; } + virtual void RingDoorbell() { } + virtual void* GetHsaQueueAddr(void) const { return reinterpret_cast(GetCmdbufAddr()); } + + hsa_status_t SwsInit(void); + hsa_status_t SwsFini(void); + hsa_status_t SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + + hsa_status_t HwsInit(void); + hsa_status_t HwsFini(void); + hsa_status_t HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + hsa_status_t SetPriority(hsa_amd_queue_priority_t priority); + + uint64_t *GetSyncAddr(void) const { return sync_addr; } + uint64_t GetCmdbufAddr(void) const { return cmdbuf_addr; } + + 
thunk_proxy::SchedLevel ConvertSchedLevel(hsa_amd_queue_priority_t prio) const { + switch (prio) { + case HSA_AMD_QUEUE_PRIORITY_LOW: + return thunk_proxy::kLow; + case HSA_AMD_QUEUE_PRIORITY_HIGH: + return thunk_proxy::kHigh; + case HSA_AMD_QUEUE_PRIORITY_NORMAL: + default: + return thunk_proxy::kNormal; + } + } + + WDDMDevice *device; + + D3DKMT_HANDLE context; + D3DKMT_HANDLE queue; + + D3DKMT_HANDLE syncobj; + uint64_t *sync_addr; + + GpuMemoryHandle cmdbuf; + uint64_t cmdbuf_addr; + uint32_t cmdbuf_size; + + GpuMemoryHandle queue_mem; + uint64_t queue_addr; + + uint32_t queue_engine; + + bool use_hws; + thunk_proxy::SchedLevel prio; +}; + +class ComputeQueue : public WDDMQueue { +public: + ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + ~ComputeQueue(); + + virtual hsa_status_t Init(void); + virtual hsa_status_t Fini(void); + virtual hsa_status_t Submit(void); + + void* GetRing(void) const { return ring; } + uint64_t GetRingSize(void) const { return ring_size; } + std::atomic* GetRingWptr(void) const { return ring_wptr; } + std::atomic* GetRingRptr(void) const { return ring_rptr; } + + uint64_t GetAqlWriteIndex(void) const { return cmdbuf_aql_frame_write_index; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size; } + void* GetHsaQueueAddr(void) const { return ring; } + + bool IsInvalidPacket(void) const { + uint16_t *packet = (uint16_t *)((char *)ring + + (cmdbuf_aql_frame_write_index % ring_size) * 64); + return ((*packet >> HSA_PACKET_HEADER_TYPE) & ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) + == HSA_PACKET_TYPE_INVALID; + } + + hsa_status_t Process(void); + uint64_t * GetDoorbellPtr() const { return (uint64_t *)&doorbell_signal_value_; } + void RingDoorbell(); +private: + hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet); + 
hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false); + struct amd_aql_pm4_ib { + uint16_t header; + uint16_t ven_hdr; + uint32_t ib_jump_cmd[4]; + uint32_t dw_cnt_remain; + uint32_t reserved[8]; + hsa_signal_t completion_signal; + }; + hsa_status_t VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet); + hsa_status_t SwitchAql2PM4(void); + + hsa_status_t PreSubmit(void); + hsa_status_t EndSubmit(void); + + void *ring; + uint64_t ring_size; + std::atomic *ring_wptr; + std::atomic *ring_rptr; + + // ib_start_addr is the current ib start address + uint64_t ib_start_addr; + + // ib_size is the current ib size. + uint64_t ib_size; + + // record the last submitted aql frame write index + uint64_t sync_point; + + uint64_t cmdbuf_aql_frame_write_index; + uint32_t cmdbuf_aql_frame_size; + + uint64_t *signal_addr_; + bool platform_atomic_support_; + bool needs_barrier; + bool ready_to_submit; + + CmdUtil cmd_util; + +private: + bool EnableProfiling() { + return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING); + } + void HandleError(hsa_status_t status); + bool UpdateScratch(uint32_t private_segment_size, bool wave32); + + uint32_t UpdateIndexStride(uint32_t srd, bool wave32); + + void *ScratchBase() { return scratch_base_; } + + void AppendCmdbufSratchBaseOffset(int offset) { + scratch_base_offset_array_.push_back(offset); + } + + bool RelocateCmdbufScratchBase(uint64_t addr); + + uint32_t ScratchSizePerWave() { return scratch_size_per_wave_; } + uint64_t GetKernelObjAddr(uint64_t addr) const; + void InitScratchSRD(); + GpuMemoryHandle amd_queue_mem_; + amd_queue_t *amd_queue_; + amd_queue_t *amd_queue_rocr_; + uint64_t doorbell_signal_value_; + volatile std::atomic *error_code_; + std::thread aql_to_pm4_thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void AqlToPm4Thread(ComputeQueue *queue); + + uint32_t 
scratch_waves_; + uint32_t scratch_size_per_wave_; + uint32_t scratch_size_; + void *scratch_base_; + GpuMemoryHandle scratch_mem_; + + std::vector scratch_base_offset_array_; +}; + +class SDMAQueue : public WDDMQueue { +public: + SDMAQueue(WDDMDevice *device, + void *ring, + uint64_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + virtual ~SDMAQueue(); + + hsa_status_t Init(void); + hsa_status_t Fini(void); + hsa_status_t Submit(void); + + int PreparePacket(uint32_t offset, uint64_t size); + + void WaitQueue(void) { + device->CpuWait(&syncobj, &rptr_next, 1, false); + } + + uint64_t * GetRingWptr(void) { return &wptr_next_; } + uint64_t * GetRingRptr(void) { return WDDMQueue::GetSyncAddr(); } + uint64_t * GetDoorbellPtr() { return &doorbell_; } + void RingDoorbell(); + void* GetHsaQueueAddr(void) const { return reinterpret_cast(GetCmdbufAddr()); } + +private: + uint64_t wptr_next_; + uint64_t wptr_pre_; + uint64_t rptr_next; + uint64_t doorbell_; + std::vector> wptr_queue_; + uint64_t ib_size; + uint64_t ib_start_addr; + + std::thread thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void SdmaThread(SDMAQueue *queue); + + struct SDMA_PKT_POLL_REGMEM { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 10; + unsigned int hdp_flush : 1; + unsigned int reserved_1 : 1; + unsigned int func : 3; + unsigned int mem_poll : 1; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int value : 32; + }; + unsigned int DW_3_DATA; + } VALUE_UNION; + + union { + struct { + unsigned int mask : 32; + }; + unsigned int DW_4_DATA; + } MASK_UNION; + + union { + struct { + unsigned int interval : 16; + unsigned int retry_count : 12; 
+ unsigned int reserved_0 : 4; + }; + unsigned int DW_5_DATA; + } DW5_UNION; + }; + const unsigned int SDMA_OP_POLL_REGMEM = 8; + bool IsPollPacket(SDMA_PKT_POLL_REGMEM* pkt) { + return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM && + pkt->HEADER_UNION.mem_poll == 1 && + pkt->HEADER_UNION.func == 3; + } + uint32_t WrapIntoRocrRing(uint64_t idx) { return (idx & (cmdbuf_size - 1)); } +}; + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/wddm/status.h b/wddm/status.h new file mode 100644 index 0000000000..528264c74e --- /dev/null +++ b/wddm/status.h @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_STATUS_H
+#define _WSL_INC_WDDM_STATUS_H
+
+enum class ErrorCode {  // result type of the wddm thunk wrappers; NTSTATUS mapping is in TranslateNtStatus() (thunks.h)
+  Success,               // STATUS_SUCCESS
+  DeviceLost,            // STATUS_DEVICE_REMOVED
+  UnSupported,
+  NotReady,              // STATUS_PENDING
+  OutOfMemory,           // STATUS_NO_MEMORY
+  OutOfGpuMemory,        // STATUS_GRAPHICS_NO_VIDEO_MEMORY
+  OutOfHandleApeMemory,
+  Timeout,               // STATUS_TIMEOUT
+  SyscallFail,
+  InvalidateParams,      // STATUS_INVALID_PARAMETER
+  Unknown,               // any NTSTATUS without an explicit mapping
+};
+
+#endif
diff --git a/wddm/thunks.h b/wddm/thunks.h
new file mode 100644
index 0000000000..9783eb0177
--- /dev/null
+++ b/wddm/thunks.h
@@ -0,0 +1,232 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_THUNKS_H +#define _WSL_INC_WDDM_THUNKS_H + +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" + +namespace wsl { +namespace thunk { + +inline ErrorCode TranslateNtStatus(NTSTATUS status) { + switch (status) { + case STATUS_SUCCESS: + return ErrorCode::Success; + case STATUS_PENDING: + return ErrorCode::NotReady; + case STATUS_NO_MEMORY: + return ErrorCode::OutOfMemory; + case STATUS_DEVICE_REMOVED: + return ErrorCode::DeviceLost; + case STATUS_GRAPHICS_NO_VIDEO_MEMORY: + return ErrorCode::OutOfGpuMemory; + case STATUS_TIMEOUT: + return ErrorCode::Timeout; + case STATUS_INVALID_PARAMETER: + return ErrorCode::InvalidateParams; + default: + break; + } + return ErrorCode::Unknown; +} + +namespace d3dthunk { + +typedef D3DKMT_CREATEALLOCATION CreateAllocationArgs; +typedef D3DKMT_CREATECONTEXT CreateContextArgs; +typedef D3DKMT_CREATECONTEXTVIRTUAL CreateContextVirtualArgs; +typedef D3DKMT_CREATEPAGINGQUEUE CreatePagingQueueArgs; 
+typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT CreateSynchronizationObjectArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT2 CreateSynchronizationObject2Args; +typedef D3DKMT_ESCAPE EscapeArgs; +typedef D3DKMT_EVICT EvictArgs; +typedef D3DKMT_FREEGPUVIRTUALADDRESS FreeGpuVirtualAddressArgs; +typedef D3DKMT_LOCK LockArgs; +typedef D3DKMT_LOCK2 Lock2Args; +typedef D3DKMT_OPENRESOURCE OpenResourceArgs; +typedef D3DKMT_OPENRESOURCEFROMNTHANDLE OpenResourceFromNtHandleArgs; +typedef D3DKMT_QUERYADAPTERINFO QueryAdapterInfoArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT SignalSynchronizationObjectArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT2 SignalSynchronizationObject2Args; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMCPU SignalSynchronizationObjectFromCpuArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU2 SignalSynchronizationObjectFromGpuArgs; +typedef D3DKMT_SUBMITCOMMAND SubmitCommandArgs; +typedef D3DKMT_UNLOCK UnlockArgs; +typedef D3DKMT_UNLOCK2 Unlock2Args; +typedef D3DKMT_UPDATEGPUVIRTUALADDRESS UpdateGpuVirtualAddressArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT WaitForSynchronizationObjectArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT2 WaitForSynchronizationObject2Args; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU WaitForSynchronizationObjectFromCpuArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU WaitForSynchronizationObjectFromGpuArgs; +typedef D3DKMT_ACQUIREKEYEDMUTEX AcquireKeyedMutexArgs; +typedef D3DKMT_RELEASEKEYEDMUTEX ReleaseKeyedMutexArgs; +typedef D3DKMT_OPENKEYEDMUTEX OpenKeyedMutexArgs; +typedef D3DKMT_DESTROYKEYEDMUTEX DestroyKeyedMutexArgs; +typedef D3DKMT_QUERYVIDEOMEMORYINFO QueryVideoMemoryInfoArgs; +typedef D3DKMT_CREATEHWQUEUE CreateHwQueueArgs; +typedef D3DKMT_DESTROYHWQUEUE DestroyHwQueueArgs; +typedef D3DKMT_SUBMITCOMMANDTOHWQUEUE SubmitCommandToHwQueueArgs; +typedef D3DKMT_SUBMITPRESENTTOHWQUEUE SubmitPresentToHwQueueArgs; +typedef D3DKMT_SUBMITSIGNALSYNCOBJECTSTOHWQUEUE 
SubmitSignalSyncObjectsToHwQueueArgs; +typedef D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE SubmitWaitForSyncObjectsToHwQueueArgs; +typedef D3DKMT_CREATESYNCFILE CreateSyncFileArgs; + +inline ErrorCode MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(D3DKMTMapGpuVirtualAddress(args)); +} + +inline ErrorCode CreateAllocation(CreateAllocationArgs *args) { + return TranslateNtStatus(D3DKMTCreateAllocation2(args)); +} + +inline ErrorCode DestroyAllocation( + WinDeviceHandle device, + WinResourceHandle resource, + size_t num_allocations, + const WinAllocationHandle *alloc_handles) { + + D3DKMT_DESTROYALLOCATION2 args{}; + + memset(&args, 0, sizeof(args)); + args.hDevice = device; + if (resource) { + args.hResource = resource; + } else { + args.phAllocationList = alloc_handles; + args.AllocationCount = num_allocations; + } + + return TranslateNtStatus(D3DKMTDestroyAllocation2(&args)); +} + +inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(D3DKMTReserveGpuVirtualAddress(args)); +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize base_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.BaseAddress = base_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize minimum_address, + gpusize maximum_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.MinimumAddress = minimum_address; + args.MaximumAddress = maximum_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode 
FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) { + return TranslateNtStatus(D3DKMTFreeGpuVirtualAddress(args)); +} + +inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, + gpusize base_address, + gpusize size) { + FreeGpuVirtualAddressArgs args{}; + args.hAdapter = handle; + args.Size = size; + args.BaseAddress = base_address; + return FreeGpuVirtualAddress(&args); +} + +inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) { + return TranslateNtStatus(D3DKMTMakeResident(args)); +} + +inline ErrorCode Evict(EvictArgs *args) { + return TranslateNtStatus(D3DKMTEvict(args)); +} + +inline ErrorCode ShareObjects(size_t num_allocations, + WinResourceHandle resource, + uint32_t flags, + int* dmabuf_fd) { + OBJECT_ATTRIBUTES obj_attr; + HANDLE nt_handle; + ErrorCode ret; + + InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr); + ret = TranslateNtStatus(D3DKMTShareObjects(num_allocations, + &resource, &obj_attr, flags, &nt_handle)); + if (ret == ErrorCode::Success) + *dmabuf_fd = *(reinterpret_cast(&nt_handle)); + else + *dmabuf_fd = -1; + + return ret; +} + +inline ErrorCode QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) { + return TranslateNtStatus(D3DKMTQueryResourceInfoFromNtHandle(args)); +} + +inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) { + return TranslateNtStatus(D3DKMTOpenResourceFromNtHandle(args)); +} + +} // namespace d3dthunk +} // namespace thunk +} // namespace wsl + +#endif diff --git a/wddm/types.h b/wddm/types.h new file mode 100644 index 0000000000..0a3ca35ebc --- /dev/null +++ b/wddm/types.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_TYPES_H_ +#define _WSL_INC_WDDM_TYPES_H_ + +#include +#include +#include "impl/thunk_proxy/wddm_types.h" +// windows wchar is 16bit, but linux is 32bit +// seems libdxcore (not dxgkrnl.ko) convert thunk windows wchar to linux one +// so only accept 32bit wchar args. note driver private data structure still +// use 16bit wchar +#define WCHAR wchar_t +#define PCWSTR const wchar_t * +#include +#undef WCHAR +#undef PCWSTR + +using gpusize = uint64_t; // Used to specify GPU addresses and sizes of GPU allocations +using WinAllocationHandle = D3DKMT_HANDLE; +using WinResourceHandle = D3DKMT_HANDLE; +using WinContextHandle = D3DKMT_HANDLE; +using WinDeviceHandle = D3DKMT_HANDLE; +using WinAdapterHandle = D3DKMT_HANDLE; + +//reference dk/winnt.h +#define STANDARD_RIGHTS_REQUIRED (0x000F0000L) + +//reference dk/ntdef.h +#define OBJ_INHERIT (0x00000002L) +typedef WCHAR *PWCHAR, *LPWCH, *PWCH; +typedef struct _UNICODE_STRING { + USHORT Length; + USHORT MaximumLength; +#ifdef MIDL_PASS + [size_is(MaximumLength / 2), length_is((Length) / 2) ] USHORT * Buffer; +#else // MIDL_PASS + _Field_size_bytes_part_opt_(MaximumLength, Length) PWCH Buffer; +#endif // MIDL_PASS +} UNICODE_STRING; +typedef UNICODE_STRING *PUNICODE_STRING; +typedef const UNICODE_STRING *PCUNICODE_STRING; + +typedef struct _OBJECT_ATTRIBUTES { + ULONG Length; + HANDLE RootDirectory; + PUNICODE_STRING ObjectName; + ULONG Attributes; + PVOID SecurityDescriptor; + PVOID SecurityQualityOfService; +} OBJECT_ATTRIBUTES; +#define InitializeObjectAttributes( p, n, a, r, s ) { \ + (p)->Length = sizeof( OBJECT_ATTRIBUTES ); \ + (p)->RootDirectory = r; \ + (p)->Attributes = a; \ + (p)->ObjectName = n; \ + (p)->SecurityDescriptor = s; \ + (p)->SecurityQualityOfService = NULL; \ + } + +#endif \ No newline at end of file diff --git a/wddm/va_mgr.h b/wddm/va_mgr.h new file mode 100644 index 0000000000..675bfc3e39 --- 
/dev/null
+++ b/wddm/va_mgr.h
@@ -0,0 +1,86 @@
+#ifndef _WSL_INC_WDDM_VA_MGR_H_
+#define _WSL_INC_WDDM_VA_MGR_H_
+
+#include
+#include
+#include "util/utils.h"
+
+namespace wsl {
+namespace thunk {
+
+class VaMgr {  // VA allocator: keeps fragments in frag_map_ (by address) and the free ones in free_list_ (by size)
+public:
+  VaMgr(uint64_t start, uint64_t size, uint64_t min_align);  // NOTE(review): presumably manages [start, start + size) - confirm in the .cpp
+  ~VaMgr();
+
+  /* Allocate `bytes` VA, if `align` is not zero, the returned address is aligned by `align`.
+   * If `addr` parameter is not zero, try best to allocate VA from fixed address `addr`.
+   */
+  uint64_t Alloc(uint64_t bytes, uint64_t align, uint64_t addr = 0);
+
+  void Free(uint64_t addr);  // NOTE(review): addr is presumably a value returned by Alloc() - confirm
+
+private:
+  uint64_t AllocImpl(uint64_t bytes, uint64_t align);
+
+  struct Fragment {  // value type stored in frag_map_
+    using ptr = std::multimap::iterator;
+    ptr free_list_entry_;  // this fragment's entry in free_list_; free_list_.end() when the fragment is in use
+
+    struct {
+      uint64_t size : 63;
+      bool is_free : 1;
+    };
+
+    Fragment() : size(0), is_free(false) {}
+    Fragment(ptr iterator, uint64_t len, bool is_free)
+        : free_list_entry_(iterator), size(len), is_free(is_free) {}
+  };
+
+  static inline Fragment make_fragment(typename Fragment::ptr iter, uint64_t len) {  // free fragment tied to free_list_ entry `iter`
+    return {iter, len, true};
+  }
+
+  inline Fragment make_fragment(uint64_t len) { return {free_list_.end(), len, false}; }  // used fragment, no free_list_ entry
+
+  static inline bool is_free(const Fragment& f) { return f.is_free; }
+  void set_used(Fragment& f) {  // only updates f; does not erase from free_list_
+    f.is_free = false;
+    f.free_list_entry_ = free_list_.end();
+  }
+  static void set_free(Fragment& f, typename Fragment::ptr iter) {
+    f.free_list_entry_ = iter;
+    f.is_free = true;
+  }
+
+  inline void remove_free_list_entry(Fragment& frag) {  // erases frag's free_list_ entry, if it has one
+    if (frag.free_list_entry_ != free_list_.end()) {
+      free_list_.erase(frag.free_list_entry_);
+      frag.free_list_entry_ = free_list_.end();
+    }
+  }
+
+  inline void add_free_fragment(uint64_t size, uint64_t base) {  // records a free fragment in both containers
+    auto it = free_list_.insert(std::make_pair(size, base));
+    frag_map_[base] = make_fragment(it, size);
+  }
+
+  inline void add_used_fragment(uint64_t size, uint64_t base) {  // records a used fragment in frag_map_ only
+    frag_map_[base] = make_fragment(size);
+  }
+  // Indexed by size
+  std::multimap free_list_;
+  // Indexed by VA, each fragment has no overlap
+  std::map frag_map_;
+
+  uint64_t min_align_;
+
+  std::mutex lock_; // Mutex protecting allocation and free of va
+
+
+  DISALLOW_COPY_AND_ASSIGN(VaMgr);
+};
+
+} // namespace thunk
+} // namespace wsl
+#endif
From 129da6526cb061579f37714a6dd38338bc364fe6 Mon Sep 17 00:00:00 2001 From: Longlong Yao Date: Thu, 12 Dec 2024 02:59:35 -0800 Subject: [PATCH 02/32] wsl/hsakmt: Add is_dgpu check for wddm device Reviewed-by: Flora Cui Signed-off-by: Longlong Yao Part-of: --- thunk_proxy/thunk_proxy.h | 1 + 1 file changed, 1 insertion(+) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index fcdb5ea4de..fe5537bdb5 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -100,6 +100,7 @@ typedef struct { bool platform_atomic_support; void *adapter_info; void *adapter_ex_info; + void *adapter_proxy_info; } DeviceInfo; int EngineOrdinal(int engine, DeviceInfo *device_info); From 4b5a9a0f8cca5d7f27e5ad5938a230e9ab403d6e Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Tue, 24 Dec 2024 17:59:42 +0800 Subject: [PATCH 03/32] wsl/hsakmt: add ULARGE_INTEGER for updated d3dukmdt.h Signed-off-by: Flora Cui Reviewed-by: Horatio Zhang Part-of: --- thunk_proxy/wddm_types.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/thunk_proxy/wddm_types.h b/thunk_proxy/wddm_types.h index 83422a83d3..3fd3f69553 100644 --- a/thunk_proxy/wddm_types.h +++ b/thunk_proxy/wddm_types.h @@ -117,6 +117,20 @@ typedef union _LARGE_INTEGER { typedef LARGE_INTEGER *PLARGE_INTEGER; +typedef union _ULARGE_INTEGER { + struct { + ULONG LowPart; + ULONG HighPart; + } DUMMYSTRUCTNAME; + struct { + ULONG LowPart; + ULONG HighPart; + } u; + ULONGLONG QuadPart; +} ULARGE_INTEGER; + +typedef ULARGE_INTEGER *PULARGE_INTEGER; + typedef struct _LUID { ULONG LowPart; LONG HighPart; From b4e6cce2047d38a8abd935cd647450f41c88c680 Mon Sep 17 00:00:00 2001 From: lyndonli Date: Mon, 6 Jan 2025 16:44:41 +0800 Subject: [PATCH 04/32] 
wsl/hsakmt: Implement fetching of UUID Signed-off-by: lyndonli Reviewed-by: Shi.Leslie Reviewed-by: Flora Cui Part-of: --- thunk_proxy/thunk_proxy.h | 2 +- wddm/device.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index fe5537bdb5..96de1caa10 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -55,7 +55,7 @@ typedef struct { int stepping; bool is_dgpu; char product_name[MAX_PATH]; - const char *uuid; + uint64_t uuid; AsicFamilyType family; uint32_t device_id; uint32_t wavefront_size; diff --git a/wddm/device.h b/wddm/device.h index 2aa751b662..30878923e9 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -83,7 +83,7 @@ public: int Stepping() { return device_info_.stepping; } bool IsDgpu() { return device_info_.is_dgpu; } const char *ProductName() { return device_info_.product_name; } - const char *Uuid() { return device_info_.uuid; } + uint64_t Uuid() { return device_info_.uuid; } thunk_proxy::AsicFamilyType GfxFamily() { return device_info_.family; } uint32_t DeviceId() { return device_info_.device_id; } uint32_t WavefrontSize() { return device_info_.wavefront_size; } From 2081ab01e6976d5061a0ec451fc71389e6f5ba65 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Tue, 21 Jan 2025 10:13:33 +0800 Subject: [PATCH 05/32] wsl/hsakmt: implement ipc mem of rocr non-legacy mode The legacy mode means buffer sharing through KFD: KFD provides a buffer id to the exporter, the exporter passes it to the importer, and the importer passes the buffer id to KFD to query and import this buffer. The non-legacy mode relies on a socket to pass the dmabuf fd between processes. In hsa-runtime, the legacy mode is the default mode; setting the environment variable HSA_ENABLE_IPC_MODE_LEGACY to 0 forces hsa-runtime onto the new mode code path. 
Reviewed-by: Flora Cui Reviewed-by: Longlong Yao Signed-off-by: tiancyin --- wddm/gpu_memory.h | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/wddm/gpu_memory.h b/wddm/gpu_memory.h index 4f5fe7e59c..378a104dd6 100644 --- a/wddm/gpu_memory.h +++ b/wddm/gpu_memory.h @@ -62,11 +62,28 @@ union GpuMemoryCreateFlags { uint64_t interprocess : 1; // physical buffer need share info between exporter and importer uint64_t locked : 1; // lock virtual address space into RAM, preventing that memory from being paged to the swap area uint64_t physical_contiguous : 1; // contiguous physical pages - uint64_t unused : 59; + uint64_t imported_vram_alloc_va : 1; // import buffer form dmabuf fd and allocate valid va (not from handle aperture) + uint64_t unused : 58; }; uint64_t reserved; }; +union GpuMemoryDescFlags { + struct { + uint32_t is_virtual : 1; + uint32_t is_shared : 1; + uint32_t is_external : 1; + uint32_t is_physical_only : 1; + uint32_t is_locked : 1; + uint32_t is_queue_referenced : 1; + uint32_t is_physical_contiguous : 1; + uint32_t is_imported_vram_alloc_va : 1; // 0 - va from handle aperture; 1 - va from local heap; + uint32_t unused : 24; + }; + + uint32_t reserved; +}; + struct GpuMemoryCreateInfo { GpuMemoryCreateInfo() { flags.reserved = 0; @@ -113,21 +130,7 @@ struct GpuMemoryDesc { gpusize alignment; gpusize handle_ape_addr; - union { - struct { - uint32_t is_virtual : 1; - uint32_t is_shared : 1; - uint32_t is_external : 1; - uint32_t is_physical_only : 1; - uint32_t is_locked : 1; - uint32_t is_queue_referenced : 1; - uint32_t is_physical_contiguous : 1; - uint32_t unused : 25; - }; - - uint32_t reserved; - } flags; - + GpuMemoryDescFlags flags; int mem_flags; int engine_flag; }; @@ -196,7 +199,7 @@ public: ErrorCode Evict(); ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS); - ErrorCode ImportPhysicalHandle(int dmabuf_fd); + ErrorCode 
ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info); ~GpuMemory(); protected: explicit GpuMemory(WDDMDevice *device); From 5f219029c239a071bbb4666433611159f20c635d Mon Sep 17 00:00:00 2001 From: tiancyin Date: Mon, 24 Feb 2025 16:17:58 +0800 Subject: [PATCH 06/32] wsl/hsakmt: implement ipc signal IPC Signal only support sys ram backend and CPU&GPU both accessible, IPC Memory only support vram backend and only GPU accessible. Reviewed-by: Flora Cui Signed-off-by: tiancyin --- wddm/device.h | 13 ++++++++++++- wddm/gpu_memory.h | 11 ++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index 30878923e9..5c6c5000f4 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -110,6 +110,7 @@ public: uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } uint64_t SharedApertureSize() { return device_info_.shared_aperture_size; } + uint64_t SystemHeapSize() { return system_heap_space_size_; } uint32_t LdsSize() { return device_info_.lds_size; } uint64_t GPUCounterFrequency() { return device_info_.gpu_counter_frequency; } uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; } @@ -184,10 +185,18 @@ public: const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } + ErrorCode ReserveIPCSysMem(gpusize size, + gpusize *out_gpu_virtual_addr, + gpusize alignment, + int &memfd, + bool lock=false); + + ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd); + ErrorCode ReserveGpuVirtualAddress(thunk_proxy::AllocDomain domain, gpusize hit_base_addr, gpusize size, - gpusize *out_gpu_virtual_addr, + gpusize *out_gpu_virtual_addr, gpusize alignment, bool lock=false); @@ -219,6 +228,8 @@ private: bool InitHandleApertureSpace(void); bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); bool DecommitSystemHeapSpace(void* addr, int64_t size); + bool 
CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); + bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); bool FreeLocalHeapSpace(void); void InitVaMgr(); void InitHandleApertureMgr(); diff --git a/wddm/gpu_memory.h b/wddm/gpu_memory.h index 378a104dd6..a04f103538 100644 --- a/wddm/gpu_memory.h +++ b/wddm/gpu_memory.h @@ -63,7 +63,8 @@ union GpuMemoryCreateFlags { uint64_t locked : 1; // lock virtual address space into RAM, preventing that memory from being paged to the swap area uint64_t physical_contiguous : 1; // contiguous physical pages uint64_t imported_vram_alloc_va : 1; // import buffer form dmabuf fd and allocate valid va (not from handle aperture) - uint64_t unused : 58; + uint64_t imported_sys_memfd : 1; // allocate system memory for IPC signal + uint64_t unused : 57; }; uint64_t reserved; }; @@ -78,7 +79,8 @@ union GpuMemoryDescFlags { uint32_t is_queue_referenced : 1; uint32_t is_physical_contiguous : 1; uint32_t is_imported_vram_alloc_va : 1; // 0 - va from handle aperture; 1 - va from local heap; - uint32_t unused : 24; + uint32_t is_imported_sys_memfd : 1; // 0 - ignored; 1 - va from system heap + uint32_t unused : 23; }; uint32_t reserved; @@ -103,7 +105,7 @@ struct GpuMemoryCreateInfo { gpusize alignment; int mem_flags; int engine_flag; - int dmabuf_fd; // Import from dmabuf + int dmabuf_fd; // Import from dmabuf void *user_ptr; gpusize va_hint; @@ -162,6 +164,7 @@ public: inline bool IsLocal() const { return desc_.domain == thunk_proxy::kLocal; } inline bool IsUserMemory() const { return desc_.domain == thunk_proxy::kUserMemory; } inline bool IsSystem() const { return desc_.domain == thunk_proxy::kSystem; } + inline bool IsSysMemFd() const { return desc_.flags.is_imported_sys_memfd; } inline bool IsUserQueue() const { return desc_.domain == thunk_proxy::kUserQueue; } inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; } inline bool IsPhysicalContiguous() const { return 
desc_.flags.is_physical_contiguous; } @@ -221,6 +224,8 @@ private: WinResourceHandle resource_; // Handle to a resource object that wraps the allocation. Used for shared resources + int mem_fd_; // IPC sigal's sys mem fd + DISALLOW_COPY_AND_ASSIGN(GpuMemory); }; From 1c4f3e86fa7d0b615c393401aaf7d9157f24b925 Mon Sep 17 00:00:00 2001 From: Longlong Yao Date: Mon, 17 Mar 2025 17:42:16 +0800 Subject: [PATCH 07/32] libhsakmt: add support to get driver version number Signed-off-by: Longlong Yao Reviewed-by: lyndonli Part-of: --- thunk_proxy/thunk_proxy.h | 1 + 1 file changed, 1 insertion(+) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index 96de1caa10..e0ff8dcdb3 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -101,6 +101,7 @@ typedef struct { void *adapter_info; void *adapter_ex_info; void *adapter_proxy_info; + uint32_t kmd_version; } DeviceInfo; int EngineOrdinal(int engine, DeviceInfo *device_info); From c01d09114b46245c4aa678a427c1d3788db5e199 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Thu, 10 Apr 2025 16:21:17 +0800 Subject: [PATCH 08/32] wsl/libhsakmt: correct gfx family id Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- thunk_proxy/thunk_proxy.h | 8 +------- wddm/device.h | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index e0ff8dcdb3..516cb9af48 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -29,12 +29,6 @@ enum SchedLevel { kHigh = 2, }; -enum AsicFamilyType { - kPlumBONITO, - kNavi44, - kNavi48 -}; - struct HwsInfo { union { struct { @@ -56,7 +50,7 @@ typedef struct { bool is_dgpu; char product_name[MAX_PATH]; uint64_t uuid; - AsicFamilyType family; + uint32_t family; uint32_t device_id; uint32_t wavefront_size; uint32_t compute_unit_count; diff --git a/wddm/device.h b/wddm/device.h index 5c6c5000f4..c082daf0d3 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -84,7 +84,7 @@ public: bool 
IsDgpu() { return device_info_.is_dgpu; } const char *ProductName() { return device_info_.product_name; } uint64_t Uuid() { return device_info_.uuid; } - thunk_proxy::AsicFamilyType GfxFamily() { return device_info_.family; } + uint32_t GfxFamily() { return device_info_.family; } uint32_t DeviceId() { return device_info_.device_id; } uint32_t WavefrontSize() { return device_info_.wavefront_size; } uint32_t ComputeUnitCount() { return device_info_.compute_unit_count; } From 250d43508ead5a96d68cea04207ac2d247fb91c7 Mon Sep 17 00:00:00 2001 From: Longlong Yao Date: Wed, 25 Jun 2025 17:10:12 +0800 Subject: [PATCH 09/32] wsl/libhsakmt: reimplement GetClockCounters Signed-off-by: Longlong Yao Reviewed-by: Flora Cui Part-of: --- thunk_proxy/thunk_proxy.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index 516cb9af48..0dceea5cd3 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -117,8 +117,6 @@ int CreateHwQueuePrivData(void **priv_data, D3DKMT_HANDLE context, bool FwManagedGfxState, SchedLevel level = kNormal); int CreateContextPrivData(void **priv_data, bool FwManagedGfxState); int CreatePowerOptPrivData(void **priv_data, bool restore); -int CreateCalibratedTimestampsPrivData(void **priv_data); -void QueryCalibratedTimestamps(void* priv, uint64_t* gpu, uint64_t* cpu); void DestroyPrivData(void *priv_data); } #endif From 61add174680933883a89bf1e0040805fec726a38 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Wed, 25 Jun 2025 12:08:35 +0800 Subject: [PATCH 10/32] wsl/libhsakmt: add .NodeId() in WDDMDevice Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- wddm/device.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wddm/device.h b/wddm/device.h index c082daf0d3..252373b565 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -75,9 +75,10 @@ class WDDMDevice { public: static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30); // 2 GB - WDDMDevice(D3DKMT_HANDLE 
adapter, LUID adapter_luid); + WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id); ~WDDMDevice(); + int NodeId() const { return node_id_; } int Major() { return device_info_.major; } int Minor() { return device_info_.minor; } int Stepping() { return device_info_.stepping; } @@ -252,6 +253,7 @@ private: uint32_t cmdbuf_size_; uint32_t cmdbuf_aql_frame_size_; static const uint32_t cmdbuf_aql_frame_num_; + uint32_t node_id_; // device info thunk_proxy::DeviceInfo device_info_; From 6d941db5ec2171c14108615fb284091760ce77f6 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Tue, 1 Jul 2025 10:42:35 +0800 Subject: [PATCH 11/32] wsl/libhsakmt: refactor ipc implementation Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- wddm/gpu_memory.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/wddm/gpu_memory.h b/wddm/gpu_memory.h index a04f103538..976783f56b 100644 --- a/wddm/gpu_memory.h +++ b/wddm/gpu_memory.h @@ -62,9 +62,10 @@ union GpuMemoryCreateFlags { uint64_t interprocess : 1; // physical buffer need share info between exporter and importer uint64_t locked : 1; // lock virtual address space into RAM, preventing that memory from being paged to the swap area uint64_t physical_contiguous : 1; // contiguous physical pages - uint64_t imported_vram_alloc_va : 1; // import buffer form dmabuf fd and allocate valid va (not from handle aperture) - uint64_t imported_sys_memfd : 1; // allocate system memory for IPC signal - uint64_t unused : 57; + uint64_t sysmem_ipc_sig_importer : 1; // allocate system memory for IPC signal + uint64_t sysmem_ipc_sig_exporter : 1; // allocate system memory for IPC signal, prepare to export + uint64_t alloc_va : 1; // allocate va. 
0 for vmem import + uint64_t unused : 56; }; uint64_t reserved; }; @@ -78,9 +79,12 @@ union GpuMemoryDescFlags { uint32_t is_locked : 1; uint32_t is_queue_referenced : 1; uint32_t is_physical_contiguous : 1; - uint32_t is_imported_vram_alloc_va : 1; // 0 - va from handle aperture; 1 - va from local heap; uint32_t is_imported_sys_memfd : 1; // 0 - ignored; 1 - va from system heap - uint32_t unused : 23; + uint32_t is_sysmem_exporter : 1; // allocate system memory for IPC signal, prepare to export + uint32_t is_va_required :1; + uint32_t is_imported_vram_vmem :1; + uint32_t is_imported_vram_ipc :1; + uint32_t unused : 20; }; uint32_t reserved; @@ -171,6 +175,7 @@ public: inline bool IsVirtual() const { return desc_.flags.is_virtual; } inline bool IsShared() const { return desc_.flags.is_shared; } inline bool IsExternal() const { return desc_.flags.is_external; } + inline bool IsVaAllocated() const { return desc_.flags.is_va_required; } inline uint32_t Flags() const { return desc_.flags.reserved; } inline int GetAllocInfo() const { return desc_.mem_flags; } From a53f1a7c1e7f0fc706902204b11a2f423f83f41e Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Sat, 5 Jul 2025 12:38:07 +0800 Subject: [PATCH 12/32] wsl/libhsakmt: add same process check for ipc buffer Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- wddm/device.h | 2 +- wddm/gpu_memory.h | 10 ++++++++-- wddm/status.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index 252373b565..1d78acf925 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -205,7 +205,7 @@ public: gpusize base_addr, gpusize size); - ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem); + ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr); ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr); void HandleApertureFree(gpusize gpu_addr); diff --git a/wddm/gpu_memory.h 
b/wddm/gpu_memory.h index 976783f56b..82e61f1b7f 100644 --- a/wddm/gpu_memory.h +++ b/wddm/gpu_memory.h @@ -84,7 +84,8 @@ union GpuMemoryDescFlags { uint32_t is_va_required :1; uint32_t is_imported_vram_vmem :1; uint32_t is_imported_vram_ipc :1; - uint32_t unused : 20; + uint32_t is_imported_from_same_process : 3; // imported from same process, record shared cnt + uint32_t unused : 17; }; uint32_t reserved; @@ -148,6 +149,8 @@ struct SharedHandleInfo { uint64_t size; uint32_t flags; int mem_flags; + pid_t pid; + gpusize gpu_addr; }; using GpuMemoryHandle = void *; @@ -187,6 +190,9 @@ public: inline void GetQueueReference() { desc_.flags.is_queue_referenced = 1; } inline void PutQueueReference() { desc_.flags.is_queue_referenced = 0; } inline bool IsQueueReferenced() const { return desc_.flags.is_queue_referenced; } + inline void IncSharedReference() { desc_.flags.is_imported_from_same_process++; } + inline uint32_t DecSharedReference() { return (desc_.flags.is_imported_from_same_process == 0) ? 
0 : --desc_.flags.is_imported_from_same_process; } + inline bool IsSharedFromSameProcess() const { return desc_.flags.is_imported_from_same_process > 0; } WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; } size_t NumChunks() const { return num_allocations_; } @@ -207,7 +213,7 @@ public: ErrorCode Evict(); ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS); - ErrorCode ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info); + ErrorCode ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr = nullptr); ~GpuMemory(); protected: explicit GpuMemory(WDDMDevice *device); diff --git a/wddm/status.h b/wddm/status.h index 528264c74e..0efd9559fd 100644 --- a/wddm/status.h +++ b/wddm/status.h @@ -54,6 +54,7 @@ enum class ErrorCode { Timeout, SyscallFail, InvalidateParams, + SameProcessSameDevice, Unknown, }; From 602ed1aff84280353a48cb0017224c304d287223 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Fri, 27 Jun 2025 09:08:19 +0800 Subject: [PATCH 13/32] wsl/libhsakmt: move local heap and va_Mgr from device to thunk runtime In multi-GPU, local heap space is shared between all GPUs and does not belong to one specific GPU, so move it from wddm device (which represents a specific GPU) to thunk runtime, which has a global view and can manage local heap for all GPUs. 
Reviewed-by: Flora Cui Signed-off-by: tiancyin --- wddm/device.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index 1d78acf925..1d7b3d1cb2 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -183,6 +183,7 @@ public: D3DKMT_HANDLE PagingFence() const { return page_syncobj_; } D3DKMT_HANDLE DeviceHandle() const { return device_; } LUID GetLuid() const { return adapter_luid_; } + D3DKMT_HANDLE GetAdapter() const { return adapter_; } const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } @@ -225,14 +226,11 @@ private: void InitCmdbufInfo(void); bool ReserveSystemHeapSpace(void); bool FreeSystemHeapSpace(void); - bool ReserveLocalHeapSpace(void); bool InitHandleApertureSpace(void); bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); bool DecommitSystemHeapSpace(void* addr, int64_t size); bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); - bool FreeLocalHeapSpace(void); - void InitVaMgr(); void InitHandleApertureMgr(); D3DKMT_HANDLE adapter_; @@ -246,8 +244,6 @@ private: uint64_t handle_aperture_start_; uint64_t handle_aperture_size_; - uint64_t local_heap_space_start_; - uint64_t local_heap_space_size_; uint64_t system_heap_space_start_; uint64_t system_heap_space_size_; uint32_t cmdbuf_size_; @@ -257,7 +253,6 @@ private: // device info thunk_proxy::DeviceInfo device_info_; - std::unique_ptr local_va_mgr_; std::unique_ptr handle_aperture_mgr_; //CmdUtil cmd_util; }; From 557f888e1ce2a0972b42be5cd341ba03bd87fc31 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Fri, 27 Jun 2025 16:47:51 +0800 Subject: [PATCH 14/32] wsl/libhsakmt: move system heap from device to thunk runtime In multi-GPU, system heap space is shared between all GPUs and does not belong to one specific GPU, so move it from wddm device (which represents a specific GPU) to thunk runtime, which has a global view and can manage 
system heap for all GPUs. Introduce a new va_Mgr instance to manage system heap, since local heap and system heap both comply with SVM(Shared Virtual Memory), without this new mgr, every allocation has to call KMD at least once (each GPU needs a call) to allocate GPU VA, the new mgr manage the space itself, no longer call KMD. Reviewed-by: Flora Cui Signed-off-by: tiancyin --- wddm/device.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index 1d7b3d1cb2..40e2632ec1 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -111,7 +111,6 @@ public: uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } uint64_t SharedApertureSize() { return device_info_.shared_aperture_size; } - uint64_t SystemHeapSize() { return system_heap_space_size_; } uint32_t LdsSize() { return device_info_.lds_size; } uint64_t GPUCounterFrequency() { return device_info_.gpu_counter_frequency; } uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; } @@ -224,8 +223,6 @@ private: void SetPowerOptimization(bool restore); void InitCmdbufInfo(void); - bool ReserveSystemHeapSpace(void); - bool FreeSystemHeapSpace(void); bool InitHandleApertureSpace(void); bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); bool DecommitSystemHeapSpace(void* addr, int64_t size); @@ -244,8 +241,6 @@ private: uint64_t handle_aperture_start_; uint64_t handle_aperture_size_; - uint64_t system_heap_space_start_; - uint64_t system_heap_space_size_; uint32_t cmdbuf_size_; uint32_t cmdbuf_aql_frame_size_; static const uint32_t cmdbuf_aql_frame_num_; From 593e919bcd4d9e5d22b0a9b35196a6167a80d105 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Wed, 2 Jul 2025 13:41:57 +0800 Subject: [PATCH 15/32] wsl/libhsakmt: move handle aperture from device to thunk runtime In multi-GPU, handle aperture is shared between all GPUs, not belongs to specific one GPU, so move it 
from wddm device (which represents a specific GPU) to thunk runtime, which has a global view and can manage handle aperture for all GPUs. Reviewed-by: Flora Cui Signed-off-by: tiancyin --- wddm/device.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index 40e2632ec1..99472b91ad 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -206,8 +206,6 @@ public: gpusize size); ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr); - ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr); - void HandleApertureFree(gpusize gpu_addr); private: bool ParseDeviceInfo(void); @@ -223,12 +221,10 @@ private: void SetPowerOptimization(bool restore); void InitCmdbufInfo(void); - bool InitHandleApertureSpace(void); bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); bool DecommitSystemHeapSpace(void* addr, int64_t size); bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); - void InitHandleApertureMgr(); D3DKMT_HANDLE adapter_; LUID adapter_luid_; @@ -239,8 +235,6 @@ private: uint64_t *page_fence_addr_; std::atomic page_fence_value_; - uint64_t handle_aperture_start_; - uint64_t handle_aperture_size_; uint32_t cmdbuf_size_; uint32_t cmdbuf_aql_frame_size_; static const uint32_t cmdbuf_aql_frame_num_; @@ -248,7 +242,6 @@ private: // device info thunk_proxy::DeviceInfo device_info_; - std::unique_ptr handle_aperture_mgr_; //CmdUtil cmd_util; }; From 3e40beb68cfe0abe13568f8f891adf00d7ee637c Mon Sep 17 00:00:00 2001 From: tiancyin Date: Mon, 30 Jun 2025 09:21:02 +0800 Subject: [PATCH 16/32] wsl/libhsakmt: move ReserveGpuVirtualAddress from device to thunk runtime For multi-GPU support, local heap and system heap managers are implemented in thunk runtime, so the heap allocation function ReserveGpuVirtualAddress should be moved to runtime too. 
Reviewed-by: Flora Cui Signed-off-by: tiancyin --- wddm/device.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index 99472b91ad..f997fec55c 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -194,17 +194,6 @@ public: ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd); - ErrorCode ReserveGpuVirtualAddress(thunk_proxy::AllocDomain domain, - gpusize hit_base_addr, - gpusize size, - gpusize *out_gpu_virtual_addr, - gpusize alignment, - bool lock=false); - - ErrorCode FreeGpuVirtualAddress(thunk_proxy::AllocDomain domain, - gpusize base_addr, - gpusize size); - ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr); private: @@ -221,8 +210,6 @@ private: void SetPowerOptimization(bool restore); void InitCmdbufInfo(void); - bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock=false); - bool DecommitSystemHeapSpace(void* addr, int64_t size); bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); From 575e25b7e4cba0d4e289327b15f1235198f27f47 Mon Sep 17 00:00:00 2001 From: tiancyin Date: Mon, 30 Jun 2025 12:11:48 +0800 Subject: [PATCH 17/32] wsl/libhsakmt: move IPC functions from device to thunk runtime IPC use system memory, it has nothing to do with wddm device. 
Reviewed-by: Flora Cui Signed-off-by: tiancyin --- wddm/device.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/wddm/device.h b/wddm/device.h index f997fec55c..8e4f783f9a 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -186,14 +186,6 @@ public: const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } - ErrorCode ReserveIPCSysMem(gpusize size, - gpusize *out_gpu_virtual_addr, - gpusize alignment, - int &memfd, - bool lock=false); - - ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd); - ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr); private: @@ -210,8 +202,6 @@ private: void SetPowerOptimization(bool restore); void InitCmdbufInfo(void); - bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); - bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); D3DKMT_HANDLE adapter_; LUID adapter_luid_; From 887056d64a73c2a76e1c8dc278de82715757a06c Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Mon, 28 Jul 2025 17:42:16 +0800 Subject: [PATCH 18/32] wsl/libhsakmt: remove redundant #include "libhsakmt.h" move libhsakmt.h inclusion to the makefile Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- wddm/cmd_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/wddm/cmd_util.h b/wddm/cmd_util.h index f69d45242a..5be6c4764f 100644 --- a/wddm/cmd_util.h +++ b/wddm/cmd_util.h @@ -9,7 +9,6 @@ #include "hsa-runtime/inc/amd_hsa_kernel_code.h" #include "impl/pm4_cmds.h" #include "util/utils.h" -#include "libhsakmt.h" namespace wsl { namespace thunk { From 838421c540ff0d549788069953bd2f49aa7bb7be Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Fri, 25 Jul 2025 11:23:04 +0800 Subject: [PATCH 19/32] wsl/libhsakmt: refactor check for supported device Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- thunk_proxy/thunk_proxy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index 0dceea5cd3..21bb983dda 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -102,7 +102,7 @@ int EngineOrdinal(int engine, DeviceInfo *device_info); bool GetHwsEnabled(int engine, DeviceInfo *device_info); bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info); bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info); -bool QueryAdapterSupported(D3DKMT_HANDLE adapter); +bool QueryAdapterSupported(unsigned int device_id); uint32_t QueueEngine2EngineFlag(uint32_t queue_engine); void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain, From 70b9951b0ca3b679dd289886662d3487567f6c96 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Fri, 25 Jul 2025 11:42:16 +0800 Subject: [PATCH 20/32] wsl/libhsakmt: refactor WDDMDevice creation Signed-off-by: Flora Cui Reviewed-by: Tianci Yin Part-of: --- wddm/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wddm/device.h b/wddm/device.h index 8e4f783f9a..2ba615cfa9 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -222,7 +222,7 @@ private: //CmdUtil cmd_util; }; -NTSTATUS WDDMGetAdapters(D3DKMT_ADAPTERINFO *&adapters, int &num_adapters); +NTSTATUS WDDMCreateDevices(std::vector &devices); } // namespace thunk } // namespace wsl From 0e8f794b1c7ae686825db381ba99add8120d4ef2 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Thu, 31 Jul 2025 16:36:31 +0800 Subject: [PATCH 21/32] wsl/libhsakmt: simplify adapter_info Signed-off-by: Flora Cui Reviewed-by: Longlong Yao Part-of: --- thunk_proxy/thunk_proxy.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index 21bb983dda..7fa538a731 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -93,8 +93,6 @@ typedef struct { bool state_shadowing_by_cpfw; bool platform_atomic_support; void *adapter_info; - void *adapter_ex_info; - void *adapter_proxy_info; uint32_t kmd_version; } 
DeviceInfo; From e2a1f0c7fc614d548ca1a0e39cea6401ca44bb92 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Fri, 1 Aug 2025 15:18:51 +0800 Subject: [PATCH 22/32] wsl/libhsakmt: refactor handling of kmd priv data Signed-off-by: Flora Cui Reviewed-by: Longlong Yao Part-of: --- thunk_proxy/thunk_proxy.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index 7fa538a731..3eace0b6e5 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -105,16 +105,17 @@ bool QueryAdapterSupported(unsigned int device_id); uint32_t QueueEngine2EngineFlag(uint32_t queue_engine); void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain, uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info); -bool CreatePrivateAllocInfo(int num_handles, void **ppdrv_priv, void **ppalloc_priv, - int *pdrv_priv_data_size, int *palloc_priv_data_size); -void DestroyPrivateAllocInfo(void *drv_priv, void *alloc_priv); +void GetAllocPrivDataSize(int *priv_drv_data_size, int *priv_alloc_data_size); +void FillinAllocPrivDrvData(void *drv_priv, int priv_alloc_data_size); -int CreateSubmitPrivData(void **priv_data, D3DKMT_HANDLE queue, uint64_t command_addr, +int GetSubmitPrivDataSize(); +void FillinSubmitPrivData(void *priv_data, D3DKMT_HANDLE queue, uint64_t command_addr, uint64_t command_size, bool is_hw_queue); -int CreateHwQueuePrivData(void **priv_data, D3DKMT_HANDLE context, - bool FwManagedGfxState, SchedLevel level = kNormal); -int CreateContextPrivData(void **priv_data, bool FwManagedGfxState); -int CreatePowerOptPrivData(void **priv_data, bool restore); -void DestroyPrivData(void *priv_data); +int GetHwQueuePrivDataSize(); +void FillinHwQueuePrivData(void *priv_data, bool FwManagedGfxState, SchedLevel level = kNormal); +int GetContextPrivDataSize(); +void FillinContextPrivData(void *priv_data, bool FwManagedGfxState); +int GetPowerOptPrivDataSize(); +void 
FillinPowerOptPrivData(void *priv_data, bool restore); } #endif From 99da7e60eca57bfe0de54e518e35edbf8ea6e105 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Wed, 16 Jul 2025 11:22:13 +0800 Subject: [PATCH 23/32] wsl/libhsakmt: adapt to the new check for kernel object Signed-off-by: Flora Cui Reviewed-by: Longlong Yao Part-of: --- wddm/gpu_memory.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/wddm/gpu_memory.h b/wddm/gpu_memory.h index 82e61f1b7f..9703a6d2c7 100644 --- a/wddm/gpu_memory.h +++ b/wddm/gpu_memory.h @@ -65,7 +65,8 @@ union GpuMemoryCreateFlags { uint64_t sysmem_ipc_sig_importer : 1; // allocate system memory for IPC signal uint64_t sysmem_ipc_sig_exporter : 1; // allocate system memory for IPC signal, prepare to export uint64_t alloc_va : 1; // allocate va. 0 for vmem import - uint64_t unused : 56; + uint64_t blit_kernel_object : 1; // allocate executable blit kernel object + uint64_t unused : 55; }; uint64_t reserved; }; @@ -85,7 +86,8 @@ union GpuMemoryDescFlags { uint32_t is_imported_vram_vmem :1; uint32_t is_imported_vram_ipc :1; uint32_t is_imported_from_same_process : 3; // imported from same process, record shared cnt - uint32_t unused : 17; + uint32_t is_blit_kernel_object : 1; // blit kernel object + uint32_t unused : 16; }; uint32_t reserved; @@ -179,6 +181,7 @@ public: inline bool IsShared() const { return desc_.flags.is_shared; } inline bool IsExternal() const { return desc_.flags.is_external; } inline bool IsVaAllocated() const { return desc_.flags.is_va_required; } + inline bool IsBlitKernelObject() const { return desc_.flags.is_blit_kernel_object; } inline uint32_t Flags() const { return desc_.flags.reserved; } inline int GetAllocInfo() const { return desc_.mem_flags; } From 25c2b740378bec8b2bbf7dd5a73df6d003dcb597 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Mon, 23 Jun 2025 16:39:54 +0800 Subject: [PATCH 24/32] librocdxg: add rocr header files Signed-off-by: Flora Cui --- hsa/Brig.h | 1131 +++++++ 
hsa/amd_hsa_common.h | 91 + hsa/amd_hsa_elf.h | 467 +++ hsa/amd_hsa_kernel_code.h | 270 ++ hsa/amd_hsa_queue.h | 154 + hsa/amd_hsa_signal.h | 79 + hsa/hsa.h | 5752 +++++++++++++++++++++++++++++++++ hsa/hsa_amd_tool.h | 91 + hsa/hsa_api_trace.h | 585 ++++ hsa/hsa_api_trace_version.h | 70 + hsa/hsa_ext_amd.h | 3675 +++++++++++++++++++++ hsa/hsa_ext_finalize.h | 531 +++ hsa/hsa_ext_image.h | 1515 +++++++++ hsa/hsa_ven_amd_aqlprofile.h | 488 +++ hsa/hsa_ven_amd_loader.h | 667 ++++ hsa/hsa_ven_amd_pc_sampling.h | 416 +++ 16 files changed, 15982 insertions(+) create mode 100644 hsa/Brig.h create mode 100644 hsa/amd_hsa_common.h create mode 100644 hsa/amd_hsa_elf.h create mode 100644 hsa/amd_hsa_kernel_code.h create mode 100644 hsa/amd_hsa_queue.h create mode 100644 hsa/amd_hsa_signal.h create mode 100644 hsa/hsa.h create mode 100644 hsa/hsa_amd_tool.h create mode 100644 hsa/hsa_api_trace.h create mode 100644 hsa/hsa_api_trace_version.h create mode 100644 hsa/hsa_ext_amd.h create mode 100644 hsa/hsa_ext_finalize.h create mode 100644 hsa/hsa_ext_image.h create mode 100644 hsa/hsa_ven_amd_aqlprofile.h create mode 100644 hsa/hsa_ven_amd_loader.h create mode 100644 hsa/hsa_ven_amd_pc_sampling.h diff --git a/hsa/Brig.h b/hsa/Brig.h new file mode 100644 index 0000000000..4f34bd1d50 --- /dev/null +++ b/hsa/Brig.h @@ -0,0 +1,1131 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013-2015, Advanced Micro Devices, Inc. +// All rights reserved. 
+// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. 
+ +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uintXX_t */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/*========================================================================================*/ +/* =======================================================================================*/ +/* =======================================================================================*/ +/* =======================================================================================*/ + +typedef uint32_t BrigCodeOffset32_t; +typedef uint32_t BrigOperandOffset32_t; +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; +typedef BrigDataOffset32_t BrigDataOffsetString32_t; + +typedef uint32_t BrigVersion32_t; +enum BrigVersion { + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint16_t BrigKind16_t; +enum BrigKind { + BRIG_KIND_NONE = 0x0000, + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, + + BRIG_KIND_INST_BEGIN = 0x2000, + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, + 
BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + BRIG_KIND_INST_END = 0x2012, + + BRIG_KIND_OPERAND_BEGIN = 0x3000, + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d +}; + +typedef uint8_t BrigAlignment8_t; +enum BrigAlignment { + BRIG_ALIGNMENT_NONE = 0, + BRIG_ALIGNMENT_1 = 1, + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_256 +}; + +typedef uint8_t BrigAllocation8_t; +enum BrigAllocation { + BRIG_ALLOCATION_NONE = 0, + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +typedef uint8_t BrigAluModifier8_t; +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +typedef uint8_t BrigAtomicOperation8_t; +enum BrigAtomicOperation { + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + 
BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +typedef uint8_t BrigCompareOperation8_t; +enum BrigCompareOperation { + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ = 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +typedef uint16_t BrigControlDirective16_t; +enum BrigControlDirective { + BRIG_CONTROL_NONE = 0, + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +typedef uint8_t BrigExecutableModifier8_t; +enum BrigExecutableModifierMask { + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +typedef uint8_t BrigImageChannelOrder8_t; +enum BrigImageChannelOrder { + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + 
BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageChannelType8_t; +enum BrigImageChannelType { + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageGeometry8_t; +enum BrigImageGeometry { + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageQuery8_t; +enum BrigImageQuery { + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + + BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6 +}; + +typedef uint8_t BrigLinkage8_t; +enum BrigLinkage { + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + 
BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +typedef uint8_t BrigMachineModel8_t; +enum BrigMachineModel { + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, +}; + +typedef uint8_t BrigMemoryModifier8_t; +enum BrigMemoryModifierMask { + BRIG_MEMORY_CONST = 1 +}; + +typedef uint8_t BrigMemoryOrder8_t; +enum BrigMemoryOrder { + BRIG_MEMORY_ORDER_NONE = 0, + BRIG_MEMORY_ORDER_RELAXED = 1, + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, + BRIG_MEMORY_ORDER_SC_RELEASE = 3, + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, +}; + +typedef uint8_t BrigMemoryScope8_t; +enum BrigMemoryScope { + BRIG_MEMORY_SCOPE_NONE = 0, + BRIG_MEMORY_SCOPE_WORKITEM = 1, + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, + BRIG_MEMORY_SCOPE_WORKGROUP = 3, + BRIG_MEMORY_SCOPE_AGENT = 4, + BRIG_MEMORY_SCOPE_SYSTEM = 5, +}; + +typedef uint16_t BrigOpcode16_t; +enum BrigOpcode { + BRIG_OPCODE_NOP = 0, + BRIG_OPCODE_ABS = 1, + BRIG_OPCODE_ADD = 2, + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, + BRIG_OPCODE_COPYSIGN = 6, + BRIG_OPCODE_DIV = 7, + BRIG_OPCODE_FLOOR = 8, + BRIG_OPCODE_FMA = 9, + BRIG_OPCODE_FRACT = 10, + BRIG_OPCODE_MAD = 11, + BRIG_OPCODE_MAX = 12, + BRIG_OPCODE_MIN = 13, + BRIG_OPCODE_MUL = 14, + BRIG_OPCODE_MULHI = 15, + BRIG_OPCODE_NEG = 16, + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, + BRIG_OPCODE_SQRT = 19, + BRIG_OPCODE_SUB = 20, + BRIG_OPCODE_TRUNC = 21, + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, + BRIG_OPCODE_LASTBIT = 39, + BRIG_OPCODE_COMBINE = 40, + BRIG_OPCODE_EXPAND = 41, + BRIG_OPCODE_LDA = 42, + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + 
BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, + BRIG_OPCODE_UNPACK = 48, + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, + BRIG_OPCODE_UNPACKCVT = 62, + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, + BRIG_OPCODE_SADHI = 65, + BRIG_OPCODE_SEGMENTP = 66, + BRIG_OPCODE_FTOS = 67, + BRIG_OPCODE_STOF = 68, + BRIG_OPCODE_CMP = 69, + BRIG_OPCODE_CVT = 70, + BRIG_OPCODE_LD = 71, + BRIG_OPCODE_ST = 72, + BRIG_OPCODE_ATOMIC = 73, + BRIG_OPCODE_ATOMICNORET = 74, + BRIG_OPCODE_SIGNAL = 75, + BRIG_OPCODE_SIGNALNORET = 76, + BRIG_OPCODE_MEMFENCE = 77, + BRIG_OPCODE_RDIMAGE = 78, + BRIG_OPCODE_LDIMAGE = 79, + BRIG_OPCODE_STIMAGE = 80, + BRIG_OPCODE_IMAGEFENCE = 81, + BRIG_OPCODE_QUERYIMAGE = 82, + BRIG_OPCODE_QUERYSAMPLER = 83, + BRIG_OPCODE_CBR = 84, + BRIG_OPCODE_BR = 85, + BRIG_OPCODE_SBR = 86, + BRIG_OPCODE_BARRIER = 87, + BRIG_OPCODE_WAVEBARRIER = 88, + BRIG_OPCODE_ARRIVEFBAR = 89, + BRIG_OPCODE_INITFBAR = 90, + BRIG_OPCODE_JOINFBAR = 91, + BRIG_OPCODE_LEAVEFBAR = 92, + BRIG_OPCODE_RELEASEFBAR = 93, + BRIG_OPCODE_WAITFBAR = 94, + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, + BRIG_OPCODE_ACTIVELANEID = 97, + BRIG_OPCODE_ACTIVELANEMASK = 98, + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, + BRIG_OPCODE_CALL = 100, + BRIG_OPCODE_SCALL = 101, + BRIG_OPCODE_ICALL = 102, + BRIG_OPCODE_RET = 103, + BRIG_OPCODE_ALLOCA = 104, + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + 
BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, + BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, + BRIG_OPCODE_LDQUEUEREADINDEX = 123, + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, + BRIG_OPCODE_STQUEUEREADINDEX = 125, + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, + BRIG_OPCODE_GROUPBASEPTR = 130, + BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, + BRIG_OPCODE_WAVEID = 136, + + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, +}; + +typedef uint8_t BrigPack8_t; +enum BrigPack { + BRIG_PACK_NONE = 0, + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +typedef uint8_t BrigProfile8_t; +enum BrigProfile { + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, +}; + +typedef uint16_t BrigRegisterKind16_t; +enum BrigRegisterKind { + BRIG_REGISTER_KIND_CONTROL = 0, + BRIG_REGISTER_KIND_SINGLE = 1, + BRIG_REGISTER_KIND_DOUBLE = 2, + BRIG_REGISTER_KIND_QUAD = 3 +}; + +typedef uint8_t BrigRound8_t; +enum BrigRound { + BRIG_ROUND_NONE = 0, + BRIG_ROUND_FLOAT_DEFAULT = 1, + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, + BRIG_ROUND_FLOAT_ZERO = 3, + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, + BRIG_ROUND_INTEGER_ZERO = 7, + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, + BRIG_ROUND_INTEGER_ZERO_SAT = 11, + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, + 
BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 +}; + +typedef uint8_t BrigSamplerAddressing8_t; +enum BrigSamplerAddressing { + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerCoordNormalization8_t; +enum BrigSamplerCoordNormalization { + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +typedef uint8_t BrigSamplerFilter8_t; +enum BrigSamplerFilter { + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerQuery8_t; +enum BrigSamplerQuery { + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +typedef uint32_t BrigSectionIndex32_t; +enum BrigSectionIndex { + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, +}; + +typedef uint8_t BrigSegCvtModifier8_t; +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 +}; + +typedef uint8_t BrigSegment8_t; +enum BrigSegment { + BRIG_SEGMENT_NONE = 0, + BRIG_SEGMENT_FLAT = 1, + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128 +}; + +enum { + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + 
BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +typedef uint16_t BrigType16_t; +enum BrigType { + BRIG_TYPE_NONE = 0, + BRIG_TYPE_U8 = 1, + BRIG_TYPE_U16 = 2, + BRIG_TYPE_U32 = 3, + BRIG_TYPE_U64 = 4, + BRIG_TYPE_S8 = 5, + BRIG_TYPE_S16 = 6, + BRIG_TYPE_S32 = 7, + BRIG_TYPE_S64 = 8, + BRIG_TYPE_F16 = 9, + BRIG_TYPE_F32 = 10, + BRIG_TYPE_F64 = 11, + BRIG_TYPE_B1 = 12, + BRIG_TYPE_B8 = 13, + BRIG_TYPE_B16 = 14, + BRIG_TYPE_B32 = 15, + BRIG_TYPE_B64 = 16, + BRIG_TYPE_B128 = 17, + BRIG_TYPE_SAMP = 18, + BRIG_TYPE_ROIMG = 19, + BRIG_TYPE_WOIMG = 20, + BRIG_TYPE_RWIMG = 21, + BRIG_TYPE_SIG32 = 22, + BRIG_TYPE_SIG64 = 23, + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | 
BRIG_TYPE_PACK_64, + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X16_ARRAY = 
BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, +}; + +typedef uint8_t BrigVariableModifier8_t; +enum BrigVariableModifierMask { + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +typedef uint8_t BrigWidth8_t; +enum BrigWidth { + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + 
BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, +}; + +struct BrigUInt64 { + uint32_t lo; + uint32_t hi; +}; + +struct BrigBase { + uint16_t byteCount; + BrigKind16_t kind; +}; + +struct BrigData { + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigDirectiveArgBlock { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; + uint16_t inArgCount; + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; +}; + +struct BrigDirectiveNone { + BrigBase base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + BrigSegment8_t segment; + 
BrigAlignment8_t align; + BrigUInt64 dim; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; + BrigVersion32_t hsailMinor; + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; +}; + +struct BrigInstBase { + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier8_t modifier; + uint8_t reserved[3]; +}; + +struct BrigInstMemFence { + BrigInstBase base; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t 
groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier8_t modifier; + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t query; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t query; + uint8_t reserved[3]; +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier8_t modifier; +}; + +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; + BrigOperandOffset32_t reg; + BrigUInt64 offset; +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; + 
BrigUInt64 width; + BrigUInt64 height; + BrigUInt64 depth; + BrigUInt64 array; +}; + +struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandRegister { + BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +typedef uint32_t BrigExceptions32_t; +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +struct BrigModuleHeader { + char identification[8]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#ifdef __cplusplus +} +#endif /*__cplusplus*/ + +#endif // defined(INCLUDED_BRIG_H) diff --git a/hsa/amd_hsa_common.h b/hsa/amd_hsa_common.h new file mode 100644 index 0000000000..7c4ed3eea4 --- /dev/null +++ b/hsa/amd_hsa_common.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// The following set of header files provides definitions for AMD GPU +// Architecture: +// - amd_hsa_common.h +// - amd_hsa_elf.h +// - amd_hsa_kernel_code.h +// - amd_hsa_queue.h +// - amd_hsa_signal.h +// +// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more +// information. 
+ +#ifndef AMD_HSA_COMMON_H +#define AMD_HSA_COMMON_H + +#include +#include + +// Descriptive version of the HSA Application Binary Interface. +#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)" + +// Alignment attribute that specifies a minimum alignment (in bytes) for +// variables of the specified type. +#if defined(__GNUC__) +# define __ALIGNED__(x) __attribute__((aligned(x))) +#elif defined(_MSC_VER) +# define __ALIGNED__(x) __declspec(align(x)) +#elif defined(RC_INVOKED) +# define __ALIGNED__(x) +#else +# error +#endif + +// Creates enumeration entries for packed types. Enumeration entries include +// bit shift amount, bit width, and bit mask. +#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ + name##_SHIFT = (shift), \ + name##_WIDTH = (width), \ + name = (((1 << (width)) - 1) << (shift)) \ + +// Gets bits for specified mask from specified src packed instance. +#define AMD_HSA_BITS_GET(src, mask) \ + ((src & mask) >> mask ## _SHIFT) \ + +// Sets val bits for specified mask in specified dst packed instance. +#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask##_SHIFT) & ~mask); \ + dst |= (((val) << mask##_SHIFT) & mask) \ + +#endif // AMD_HSA_COMMON_H diff --git a/hsa/amd_hsa_elf.h b/hsa/amd_hsa_elf.h new file mode 100644 index 0000000000..2b6c4c9672 --- /dev/null +++ b/hsa/amd_hsa_elf.h @@ -0,0 +1,467 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Undefine the macro in case it is defined in the system elf.h. +#undef EM_AMDGPU + +#ifndef AMD_HSA_ELF_H +#define AMD_HSA_ELF_H + +// AMD GPU Specific ELF Header Enumeration Values. +// +// Values are copied from LLVM BinaryFormat/ELF.h . This file also contains +// code object V1 defintions which are not part of the LLVM header. 
Code object +// V1 was only supported by the Finalizer which is now deprecated and removed. +// +// TODO: Deprecate and remove V1 support and replace this header with using the +// LLVM header. +namespace ELF { + +// Machine architectures +// See current registered ELF machine architectures at: +// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html +enum { + EM_AMDGPU = 224, // AMD GPU architecture +}; + +// OS ABI identification. +enum { + ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime +}; + +// AMDGPU OS ABI Version identification. +enum { + // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification + // was never defined for V1. + ELFABIVERSION_AMDGPU_HSA_V2 = 0, + ELFABIVERSION_AMDGPU_HSA_V3 = 1, + ELFABIVERSION_AMDGPU_HSA_V4 = 2, + ELFABIVERSION_AMDGPU_HSA_V5 = 3, + ELFABIVERSION_AMDGPU_HSA_V6 = 4, +}; + +// AMDGPU specific e_flags. +enum : unsigned { + // Processor selection mask for EF_AMDGPU_MACH_* values. + EF_AMDGPU_MACH = 0x0ff, + + // Not specified processor. + EF_AMDGPU_MACH_NONE = 0x000, + + // AMDGCN-based processors. 
+ // clang-format off + EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, + EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, + EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, + EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, + EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, + EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, + EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027, + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, + EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, + EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, + EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, + EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, + EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, + EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, + EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, + EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, + EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, + EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, + EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, + EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, + EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, + EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, + EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, + EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, + EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, + EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, + EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, + EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f, + 
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, + EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, + EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, + EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, + EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, + EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057, + EF_AMDGPU_MACH_AMDGCN_GFX1153 = 0x058, + EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC = 0x059, + EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC = 0x05f, + // clang-format on + + // First/last AMDGCN-based processors. + EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, + + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2. + EF_AMDGPU_FEATURE_XNACK_V2 = 0x01, + // Indicates if the trap handler is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2. + EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02, + + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_XNACK_V3 = 0x100, + // Indicates if the "sramecc" target feature is enabled for all code + // contained in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200, + + // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. + EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, + // XNACK is not supported. + EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, + // XNACK is any/default/unspecified. + EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100, + // XNACK is off. + EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200, + // XNACK is on. 
+ EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300, + + // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. + EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, + // SRAMECC is not supported. + EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, + // SRAMECC is any/default/unspecified. + EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400, + // SRAMECC is off. + EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, + // SRAMECC is on. + EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, + + // Generic target versioning. This is contained in the list byte of EFLAGS. + EF_AMDGPU_GENERIC_VERSION = 0xff000000, + EF_AMDGPU_GENERIC_VERSION_OFFSET = 24, + EF_AMDGPU_GENERIC_VERSION_MIN = 1, + EF_AMDGPU_GENERIC_VERSION_MAX = 0xff, +}; + +// ELF Relocation types for AMDGPU. +enum : unsigned { + R_AMDGPU_ABS32_LO = 1, + R_AMDGPU_ABS32_HI = 2, + R_AMDGPU_ABS64 = 3, + R_AMDGPU_ABS32 = 6, + R_AMDGPU_RELATIVE64 = 13, +}; + +} // end namespace ELF + +// ELF Section Header Flag Enumeration Values. +#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS) + +// +typedef enum { + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1, + AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2, + AMDGPU_HSA_SEGMENT_CODE_AGENT = 3, + AMDGPU_HSA_SEGMENT_LAST, +} amdgpu_hsa_elf_segment_t; + +// ELF Program Header Type Enumeration Values. +#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) +#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) +#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT) +#define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT) + +// ELF Symbol Type Enumeration Values. 
+#define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0) +#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1) +#define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2) + +// ELF Symbol Binding Enumeration Values. +#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0) + +// ELF Symbol Other Information Creation/Retrieval. +#define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3) +#define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4) +#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3)) + +typedef enum { + AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0, + AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1, + AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2, + AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3, + AMDGPU_HSA_SYMBOL_ALLOCATION_LAST, +} amdgpu_hsa_symbol_allocation_t; + +// ELF Symbol Allocation Enumeration Values. +#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT +#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM +#define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT +#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT + +typedef enum { + AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0, + AMDGPU_HSA_SYMBOL_FLAG_CONST = 1, + AMDGPU_HSA_SYMBOL_FLAG_LAST, +} amdgpu_hsa_symbol_flag_t; + +// ELF Symbol Flag Enumeration Values. +#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST + +// Legacy/V1 AMD GPU Relocation Type Enumeration Values. +#define R_AMDGPU_V1_NONE 0 +#define R_AMDGPU_V1_32_LOW 1 +#define R_AMDGPU_V1_32_HIGH 2 +#define R_AMDGPU_V1_64 3 +#define R_AMDGPU_V1_INIT_SAMPLER 4 +#define R_AMDGPU_V1_INIT_IMAGE 5 +#define R_AMDGPU_V1_RELATIVE64 13 + +// AMD GPU Note Type Enumeration Values. 
+#define NT_AMD_HSA_CODE_OBJECT_VERSION 1 +#define NT_AMD_HSA_HSAIL 2 +#define NT_AMD_HSA_ISA_VERSION 3 +#define NT_AMD_HSA_PRODUCER 4 +#define NT_AMD_HSA_PRODUCER_OPTIONS 5 +#define NT_AMD_HSA_EXTENSION 6 +#define NT_AMD_HSA_ISA_NAME 11 +/* AMDGPU snapshots of runtime, agent and queues state for use in core dump */ +#define NT_AMDGPU_CORE_STATE 33 +#define NT_AMD_HSA_HLDEBUG_DEBUG 101 +#define NT_AMD_HSA_HLDEBUG_TARGET 102 + +// AMD GPU Metadata Kind Enumeration Values. +typedef uint16_t amdgpu_hsa_metadata_kind16_t; +typedef enum { + AMDGPU_HSA_METADATA_KIND_NONE = 0, + AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1, + AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2, + AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3, + AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4 +} amdgpu_hsa_metadata_kind_t; + +// AMD GPU Sampler Coordinate Normalization Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_coord8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0, + AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1 +} amdgpu_hsa_sampler_coord_t; + +// AMD GPU Sampler Filter Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_filter8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0, + AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1 +} amdgpu_hsa_sampler_filter_t; + +// AMD GPU Sampler Addressing Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_addressing8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0, + AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1, + AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2, + AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3, + AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4 +} amdgpu_hsa_sampler_addressing_t; + +// AMD GPU Sampler Descriptor. 
+typedef struct amdgpu_hsa_sampler_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_sampler_coord8_t coord; + amdgpu_hsa_sampler_filter8_t filter; + amdgpu_hsa_sampler_addressing8_t addressing; + uint8_t reserved1; +} amdgpu_hsa_sampler_descriptor_t; + +// AMD GPU Image Geometry Enumeration Values. +typedef uint8_t amdgpu_hsa_image_geometry8_t; +typedef enum { + AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0, + AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1, + AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2, + AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3, + AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4, + AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5, + AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6, + AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7 +} amdgpu_hsa_image_geometry_t; + +// AMD GPU Image Channel Order Enumeration Values. +typedef uint8_t amdgpu_hsa_image_channel_order8_t; +typedef enum { + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} amdgpu_hsa_image_channel_order_t; + +// AMD GPU Image Channel Type Enumeration Values. 
+typedef uint8_t amdgpu_hsa_image_channel_type8_t; +typedef enum { + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} amdgpu_hsa_image_channel_type_t; + +// AMD GPU Image Descriptor. +typedef struct amdgpu_hsa_image_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_image_geometry8_t geometry; + amdgpu_hsa_image_channel_order8_t channel_order; + amdgpu_hsa_image_channel_type8_t channel_type; + uint8_t reserved1; + uint64_t width; + uint64_t height; + uint64_t depth; + uint64_t array; +} amdgpu_hsa_image_descriptor_t; + +typedef struct amdgpu_hsa_note_code_object_version_s { + uint32_t major_version; + uint32_t minor_version; +} amdgpu_hsa_note_code_object_version_t; + +typedef struct amdgpu_hsa_note_hsail_s { + uint32_t hsail_major_version; + uint32_t hsail_minor_version; + uint8_t profile; + uint8_t machine_model; + uint8_t default_float_round; +} amdgpu_hsa_note_hsail_t; + +typedef struct amdgpu_hsa_note_isa_s { + uint16_t vendor_name_size; + uint16_t architecture_name_size; + uint32_t major; + uint32_t minor; + uint32_t stepping; + char vendor_and_architecture_name[1]; +} amdgpu_hsa_note_isa_t; + +typedef struct amdgpu_hsa_note_producer_s { + uint16_t producer_name_size; + uint16_t reserved; + uint32_t 
producer_major_version; + uint32_t producer_minor_version; + char producer_name[1]; +} amdgpu_hsa_note_producer_t; + +typedef struct amdgpu_hsa_note_producer_options_s { + uint16_t producer_options_size; + char producer_options[1]; +} amdgpu_hsa_note_producer_options_t; + +typedef enum { + AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_RODATA_GLOBAL_AGENT, + AMDGPU_HSA_RODATA_READONLY_AGENT, + AMDGPU_HSA_DATA_GLOBAL_PROGRAM, + AMDGPU_HSA_DATA_GLOBAL_AGENT, + AMDGPU_HSA_DATA_READONLY_AGENT, + AMDGPU_HSA_BSS_GLOBAL_PROGRAM, + AMDGPU_HSA_BSS_GLOBAL_AGENT, + AMDGPU_HSA_BSS_READONLY_AGENT, + AMDGPU_HSA_SECTION_LAST, +} amdgpu_hsa_elf_section_t; + +#endif // AMD_HSA_ELF_H diff --git a/hsa/amd_hsa_kernel_code.h b/hsa/amd_hsa_kernel_code.h new file mode 100644 index 0000000000..c00c88c024 --- /dev/null +++ b/hsa/amd_hsa_kernel_code.h @@ -0,0 +1,270 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_KERNEL_CODE_H +#define AMD_HSA_KERNEL_CODE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Kernel Code Version Enumeration Values. +typedef uint32_t amd_kernel_code_version32_t; +enum amd_kernel_code_version_t { + AMD_KERNEL_CODE_VERSION_MAJOR = 1, + AMD_KERNEL_CODE_VERSION_MINOR = 1 +}; + +// AMD Machine Kind Enumeration Values. +typedef uint16_t amd_machine_kind16_t; +enum amd_machine_kind_t { + AMD_MACHINE_KIND_UNDEFINED = 0, + AMD_MACHINE_KIND_AMDGPU = 1 +}; + +// AMD Machine Version. +typedef uint16_t amd_machine_version16_t; + +// AMD Float Round Mode Enumeration Values. +enum amd_float_round_mode_t { + AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0, + AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1, + AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2, + AMD_FLOAT_ROUND_MODE_ZERO = 3 +}; + +// AMD Float Denorm Mode Enumeration Values. 
+enum amd_float_denorm_mode_t { + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0, + AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1, + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2, + AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3 +}; + +// AMD Compute Program Resource Register One. +typedef uint32_t amd_compute_pgm_rsrc_one32_t; +enum amd_compute_pgm_rsrc_one_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6) +}; + +// AMD System VGPR Workitem ID Enumeration Values. +enum amd_system_vgpr_workitem_id_t { + AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2, + AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3 +}; + +// AMD Compute Program Resource Register Two. 
+typedef uint32_t amd_compute_pgm_rsrc_two32_t; +enum amd_compute_pgm_rsrc_two_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1) +}; + +// AMD Element 
Byte Size Enumeration Values. +enum amd_element_byte_size_t { + AMD_ELEMENT_BYTE_SIZE_2 = 0, + AMD_ELEMENT_BYTE_SIZE_4 = 1, + AMD_ELEMENT_BYTE_SIZE_8 = 2, + AMD_ELEMENT_BYTE_SIZE_16 = 3 +}; + +// AMD Kernel Code Properties. +typedef uint32_t amd_kernel_code_properties32_t; +enum amd_kernel_code_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1), + 
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9) +}; + +// AMD Power Of Two Enumeration Values. +typedef uint8_t amd_powertwo8_t; +enum amd_powertwo_t { + AMD_POWERTWO_1 = 0, + AMD_POWERTWO_2 = 1, + AMD_POWERTWO_4 = 2, + AMD_POWERTWO_8 = 3, + AMD_POWERTWO_16 = 4, + AMD_POWERTWO_32 = 5, + AMD_POWERTWO_64 = 6, + AMD_POWERTWO_128 = 7, + AMD_POWERTWO_256 = 8 +}; + +// AMD Enabled Control Directive Enumeration Values. +typedef uint64_t amd_enabled_control_directive64_t; +enum amd_enabled_control_directive_t { + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1, + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256 +}; + +// AMD Exception Kind Enumeration Values. +typedef uint16_t amd_exception_kind16_t; +enum amd_exception_kind_t { + AMD_EXCEPTION_KIND_INVALID_OPERATION = 1, + AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2, + AMD_EXCEPTION_KIND_OVERFLOW = 4, + AMD_EXCEPTION_KIND_UNDERFLOW = 8, + AMD_EXCEPTION_KIND_INEXACT = 16 +}; + +// AMD Control Directives. 
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64 +#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) +typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s { + amd_enabled_control_directive64_t enabled_control_directives; + uint16_t enable_break_exceptions; + uint16_t enable_detect_exceptions; + uint32_t max_dynamic_group_size; + uint64_t max_flat_grid_size; + uint32_t max_flat_workgroup_size; + uint8_t required_dim; + uint8_t reserved1[3]; + uint64_t required_grid_size[3]; + uint32_t required_workgroup_size[3]; + uint8_t reserved2[60]; +} amd_control_directives_t; + +// AMD Kernel Code. +#define AMD_ISA_ALIGN_BYTES 256 +#define AMD_KERNEL_CODE_ALIGN_BYTES 64 +#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) +typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s { + amd_kernel_code_version32_t amd_kernel_code_version_major; + amd_kernel_code_version32_t amd_kernel_code_version_minor; + amd_machine_kind16_t amd_machine_kind; + amd_machine_version16_t amd_machine_version_major; + amd_machine_version16_t amd_machine_version_minor; + amd_machine_version16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; + amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; + amd_kernel_code_properties32_t kernel_code_properties; + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t 
debug_private_segment_buffer_sgpr; + amd_powertwo8_t kernarg_segment_alignment; + amd_powertwo8_t group_segment_alignment; + amd_powertwo8_t private_segment_alignment; + amd_powertwo8_t wavefront_size; + int32_t call_convention; + uint8_t reserved1[12]; + uint64_t runtime_loader_kernel_symbol; + amd_control_directives_t control_directives; +} amd_kernel_code_t; + +// TODO: this struct should be completely gone once debugger designs/implements +// Debugger APIs. +typedef struct amd_runtime_loader_debug_info_s { + const void* elf_raw; + size_t elf_size; + const char *kernel_name; + const void *owning_segment; +} amd_runtime_loader_debug_info_t; + +#endif // AMD_HSA_KERNEL_CODE_H diff --git a/hsa/amd_hsa_queue.h b/hsa/amd_hsa_queue.h new file mode 100644 index 0000000000..9f16f9b2e5 --- /dev/null +++ b/hsa/amd_hsa_queue.h @@ -0,0 +1,154 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_QUEUE_H +#define AMD_HSA_QUEUE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Queue Properties. Flags stored in an amd_queue_properties32_t; the numeric macro arguments below appear to be (bit offset, bit width) and together span bits 0-31 - TODO confirm against AMD_HSA_BITS_CREATE_ENUM_ENTRIES in amd_hsa_common.h. +typedef uint32_t amd_queue_properties32_t; +enum amd_queue_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27) +}; + +// AMD Queue. +#define AMD_QUEUE_ALIGN_BYTES 64 +#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES) + +// AMD Queue Capabilities.
+typedef uint32_t amd_queue_capabilities32_t; +enum amd_queue_capabilities_t { + /* This version of CP FW supports dual-scratch and async-reclaim */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_CP_ASYNC_RECLAIM, 0, 1), + + /* + * This version of ROCr supports async-reclaim and CP FW may access the + * V2 fields. + */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM, 1, 1), +}; + +/* This is the original amd_queue_t structure. The definition is only kept + * for reference purposes. This structure should not be used; see amd_queue_v2_t below. */ +typedef struct AMD_QUEUE_ALIGN amd_queue_s { + hsa_queue_t hsa_queue; + uint32_t caps; + uint32_t reserved1[3]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint32_t reserved3[2]; + uint32_t scratch_wave64_lane_byte_size; + amd_queue_properties32_t queue_properties; + uint32_t reserved4[2]; + hsa_signal_t queue_inactive_signal; + uint32_t reserved5[14]; +} amd_queue_t; + +/* + * AMD_QUEUE Version 2 + * amd_queue_v2_t is backwards compatible with amd_queue_t structure and can + * be used with previous versions of CP FW. The added fields tagged as V2 are + * ignored when running previous versions of CP FW. + * CP FW will not try to access elements beyond the original 64-bytes + * (sizeof(amd_queue_t)) unless the AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM bit is set. NOTE(review): the amd_queue_t members declared above, even without hsa_queue and queue_inactive_signal, total 208 bytes, so the 64-byte figure looks inconsistent with sizeof(amd_queue_t) - confirm.
+ */ + +#define MAX_NUM_XCC 128 /* element count of amd_queue_v2_t::scratch_last_used_index */ +typedef struct scratch_last_used_index_xcc_s { + volatile uint64_t main; + volatile uint64_t alt; +} scratch_last_used_index_xcc_t; + +typedef struct AMD_QUEUE_ALIGN amd_queue_v2_s { + hsa_queue_t hsa_queue; + uint32_t caps; + uint32_t reserved1[3]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint64_t scratch_backing_memory_byte_size; /* occupies the 8 bytes that are reserved3[2] in amd_queue_t */ + uint32_t scratch_wave64_lane_byte_size; + amd_queue_properties32_t queue_properties; + volatile uint64_t scratch_max_use_index; /* V2; occupies the 8 bytes that are reserved4[2] in amd_queue_t */ + hsa_signal_t queue_inactive_signal; + volatile uint64_t alt_scratch_max_use_index; /* V2 */ + uint32_t alt_scratch_resource_descriptor[4]; /* V2 */ + uint64_t alt_scratch_backing_memory_location; /* V2 */ + uint32_t alt_scratch_dispatch_limit_x; /* V2 */ + uint32_t alt_scratch_dispatch_limit_y; /* V2 */ + uint32_t alt_scratch_dispatch_limit_z; /* V2 */ + uint32_t alt_scratch_wave64_lane_byte_size; /* V2 */ + uint32_t alt_compute_tmpring_size; /* V2 */ + uint32_t reserved5; /* members from alt_scratch_max_use_index through reserved5 total 56 bytes, the size of reserved5[14] in amd_queue_t */ + + scratch_last_used_index_xcc_t scratch_last_used_index[MAX_NUM_XCC]; /* has no counterpart in amd_queue_t */ +} amd_queue_v2_t; + +#endif // AMD_HSA_QUEUE_H diff --git a/hsa/amd_hsa_signal.h b/hsa/amd_hsa_signal.h new file mode 100644 index 0000000000..fa797599a0 --- /dev/null +++ b/hsa/amd_hsa_signal.h @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_SIGNAL_H +#define AMD_HSA_SIGNAL_H + +#include "amd_hsa_common.h" +#include "amd_hsa_queue.h" + +// AMD Signal Kind Enumeration Values. 
+typedef int64_t amd_signal_kind64_t; +enum amd_signal_kind_t { + AMD_SIGNAL_KIND_INVALID = 0, + AMD_SIGNAL_KIND_USER = 1, + AMD_SIGNAL_KIND_DOORBELL = -1, + AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 +}; + +// AMD Signal. With 8-byte pointers the members below total 64 bytes, equal to AMD_SIGNAL_ALIGN_BYTES. +#define AMD_SIGNAL_ALIGN_BYTES 64 +#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES) +typedef struct AMD_SIGNAL_ALIGN amd_signal_s { + amd_signal_kind64_t kind; + union { /* value and hardware_doorbell_ptr share storage; presumably selected by kind - TODO confirm */ + volatile int64_t value; + volatile uint64_t* hardware_doorbell_ptr; + }; + uint64_t event_mailbox_ptr; + uint32_t event_id; + uint32_t reserved1; + uint64_t start_ts; + uint64_t end_ts; + union { /* queue_ptr and reserved2 share storage */ + amd_queue_v2_t* queue_ptr; + uint64_t reserved2; + }; + uint32_t reserved3[2]; +} amd_signal_t; + +#endif // AMD_HSA_SIGNAL_H diff --git a/hsa/hsa.h b/hsa/hsa.h new file mode 100644 index 0000000000..00753e992e --- /dev/null +++ b/hsa/hsa.h @@ -0,0 +1,5752 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_H_ +#define HSA_RUNTIME_INC_HSA_H_ + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uintXX_t */ + +#ifndef __cplusplus +#include <stdbool.h> /* bool */ +#endif /* __cplusplus */ + +// Placeholder for calling convention and import/export macros +#ifndef HSA_CALL +#define HSA_CALL +#endif + +#ifndef HSA_EXPORT_DECORATOR +#ifdef __GNUC__ +#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default"))) +#else +#define HSA_EXPORT_DECORATOR +#endif +#endif +#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL +#define HSA_API_IMPORT HSA_CALL + +#if !defined(HSA_API) && defined(HSA_EXPORT) +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +// Detect and set large model builds.
+#undef HSA_LARGE_MODEL +#if defined(__LP64__) || defined(_M_X64) +#define HSA_LARGE_MODEL +#endif + +// Try to detect CPU endianness +#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU) +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LITTLEENDIAN_CPU +#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define BIGENDIAN_CPU +#elif defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) || defined(__loongarch64) || defined(__riscv) +#define LITTLEENDIAN_CPU +#endif +#endif + +#undef HSA_LITTLE_ENDIAN +#if defined(LITTLEENDIAN_CPU) +#define HSA_LITTLE_ENDIAN +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +#ifndef HSA_DEPRECATED +#define HSA_DEPRECATED +//#ifdef __GNUC__ +//#define HSA_DEPRECATED __attribute__((deprecated)) +//#else +//#define HSA_DEPRECATED __declspec(deprecated) +//#endif +#endif + +#define HSA_VERSION_1_0 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \addtogroup error-codes Error codes + * @{ + */ + +/** + * @brief Status codes. + */ +typedef enum { + /** + * The function has been executed successfully. + */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred. + */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. 
+ */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events. + */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a function are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid. + */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception.
+ */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016, + /** + * The code object symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, + /** + * The executable symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, + /** + * The file descriptor is invalid. + */ + HSA_STATUS_ERROR_INVALID_FILE = 0x1020, + /** + * The code object reader is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, + /** + * The cache is invalid. + */ + HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, + /** + * The wavefront is invalid. + */ + HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, + /** + * The signal group is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, + /** + * The HSA runtime is not in the configuration state. + */ + HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025, + /** + * The queue received an error that may require process termination. + */ + HSA_STATUS_ERROR_FATAL = 0x1026 +} hsa_status_t; + +/** + * @brief Query additional information about a status code. + * + * @param[in] status Status code. + * + * @param[out] status_string A NUL-terminated string that describes the error + * status. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid + * status code, or @p status_string is NULL. + */ +hsa_status_t HSA_API hsa_status_string( + hsa_status_t status, + const char ** status_string); + +/** @} */ + +/** \defgroup common Common Definitions + * @{ + */ + +/** + * @brief Three-dimensional coordinate. + */ +typedef struct hsa_dim3_s { + /** + * X dimension. + */ + uint32_t x; + + /** + * Y dimension. + */ + uint32_t y; + + /** + * Z dimension. + */ + uint32_t z; +} hsa_dim3_t; + +/** + * @brief Access permissions. 
+ */ +typedef enum { + /** + * Used to remove existing access + */ + HSA_ACCESS_PERMISSION_NONE = 0, + /** + * Read-only access. + */ + HSA_ACCESS_PERMISSION_RO = 1, + /** + * Write-only access. + */ + HSA_ACCESS_PERMISSION_WO = 2, + /** + * Read and write access. + */ + HSA_ACCESS_PERMISSION_RW = 3 +} hsa_access_permission_t; + +/** + * @brief POSIX file descriptor. + */ +typedef int hsa_file_t; + +/** @} **/ + + +/** \defgroup initshutdown Initialization and Shut Down + * @{ + */ + +/** + * @brief Initialize the HSA runtime. + * + * @details Initializes the HSA runtime if it is not already initialized, and + * increases the reference counter associated with the HSA runtime for the + * current process. Invocation of any HSA function other than ::hsa_init results + * in undefined behavior if the current HSA runtime reference counter is less + * than one. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference + * count reaches INT32_MAX. + */ +hsa_status_t HSA_API hsa_init(); + +/** + * @brief Shut down the HSA runtime. + * + * @details Decreases the reference count of the HSA runtime instance. When the + * reference count reaches 0, the HSA runtime is no longer considered valid + * but the application might call ::hsa_init to initialize the HSA runtime + * again. + * + * Once the reference count of the HSA runtime reaches 0, all the resources + * associated with it (queues, signals, agent information, etc.) are + * considered invalid and any attempt to reference them in subsequent API calls + * results in undefined behavior. When the reference count reaches 0, the HSA + * runtime may release resources associated with it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_shut_down(); + +/** @} **/ + +/** \defgroup agentinfo System and Agent Information + * @{ + */ + +/** + * @brief Endianness. A convention used to interpret the bytes making up a data + * word. + */ +typedef enum { + /** + * The least significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_LITTLE = 0, + /** + * The most significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_BIG = 1 +} hsa_endianness_t; + +/** + * @brief Machine model. A machine model determines the size of certain data + * types in HSA runtime and an agent. + */ +typedef enum { + /** + * Small machine model. Addresses use 32 bits. + */ + HSA_MACHINE_MODEL_SMALL = 0, + /** + * Large machine model. Addresses use 64 bits. + */ + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +/** + * @brief Profile. A profile indicates a particular level of feature + * support. For example, in the base profile the application must use the HSA + * runtime allocator to reserve shared virtual memory, while in the full profile + * any host pointer can be shared across all the agents. + */ +typedef enum { + /** + * Base profile. + */ + HSA_PROFILE_BASE = 0, + /** + * Full profile. + */ + HSA_PROFILE_FULL = 1 +} hsa_profile_t; + +/** + * @brief System attributes. + */ +typedef enum { + /** + * Major version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MAJOR = 0, + /** + * Minor version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MINOR = 1, + /** + * Current timestamp. The value of this attribute monotonically increases at a + * constant rate. The type of this attribute is uint64_t. 
+ */ + HSA_SYSTEM_INFO_TIMESTAMP = 2, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3, + /** + * Maximum duration of a signal wait operation. Expressed as a count based on + * the timestamp frequency. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4, + /** + * Endianness of the system. The type of this attribute is ::hsa_endianness_t. + */ + HSA_SYSTEM_INFO_ENDIANNESS = 5, + /** + * Machine model supported by the HSA runtime. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_SYSTEM_INFO_MACHINE_MODEL = 6, + /** + * Bit-mask indicating which extensions are supported by the + * implementation. An extension with an ID of @p i is supported if the bit at + * position @p i is set. The type of this attribute is uint8_t[128]. + */ + HSA_SYSTEM_INFO_EXTENSIONS = 7, + /** + * String containing the ROCr build identifier. + */ + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202, + /** + * Returns true if mwaitx is enabled on this system + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203, + /** + * Returns true if DMABUF APIs are supported by the driver. The type of + * this attribute is bool. 
+ */ + HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED = 0x204, + /** + * Returns true if Virtual Memory APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED = 0x205, + /** + * Returns true if XNACK is enabled on this system. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_XNACK_ENABLED = 0x206, + /** + * Major version of the HSA runtime extension specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_AMD_SYSTEM_INFO_EXT_VERSION_MAJOR = 0x207, + /** + * Minor version of the HSA runtime extension specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_AMD_SYSTEM_INFO_EXT_VERSION_MINOR = 0x208, +} hsa_system_info_t; + +/** + * @brief Get the current value of a system attribute. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * system attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_system_get_info( + hsa_system_info_t attribute, + void* value); + +/** + * @brief HSA extensions. + */ +typedef enum { + /** + * Finalizer extension. + */ + HSA_EXTENSION_FINALIZER = 0, + /** + * Images extension. + */ + HSA_EXTENSION_IMAGES = 1, + + /** + * Performance counter extension. + */ + HSA_EXTENSION_PERFORMANCE_COUNTERS = 2, + + /** + * Profiling events extension. + */ + HSA_EXTENSION_PROFILING_EVENTS = 3, + /** + * Last standard extension identifier (same value as ::HSA_EXTENSION_PROFILING_EVENTS; not a count).
+ */ + HSA_EXTENSION_STD_LAST = 3, + /** + * First AMD extension number. + */ + HSA_AMD_FIRST_EXTENSION = 0x200, + /** + * Profiler extension. + */ + HSA_EXTENSION_AMD_PROFILER = 0x200, + /** + * Loader extension. + */ + HSA_EXTENSION_AMD_LOADER = 0x201, + /** + * AqlProfile extension. + */ + HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * PC Sampling extension. + */ + HSA_EXTENSION_AMD_PC_SAMPLING = 0x203, + /** + * Last AMD extension. + */ + HSA_AMD_LAST_EXTENSION = 0x203 +} hsa_extension_t; + +/** + * @brief Query the name of a given extension. + * + * @param[in] extension Extension identifier. If the extension is not supported + * by the implementation (see ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior + * is undefined. + * + * @param[out] name Pointer to a memory location where the HSA runtime stores + * the extension name. The extension name is a NUL-terminated string. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p name is NULL. + */ +hsa_status_t HSA_API hsa_extension_get_name( + uint16_t extension, + const char **name); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by the HSA + * implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by the HSA + * implementation. All minor versions from 0 up to the returned @p version_minor + * must be supported by the implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Pointer to a memory location where the HSA runtime stores the returned minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_system_major_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** + * @deprecated + * + * @brief Retrieve the function pointers corresponding to a given version of an + * extension. Portable applications are expected to invoke the extension API + * using the returned function pointers. + * + * @details The application is responsible for verifying that the given version + * of the extension is supported by the HSA implementation (see + * ::hsa_system_extension_supported). If the given combination of extension, + * major version, and minor version is not supported by the implementation, the + * behavior is undefined. + * + * @param[in] extension Extension identifier.
+ * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] version_minor Minor version number for which to retrieve the + * function pointer table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + void *table); + +/** + * @brief Retrieve the function pointers corresponding to a given major version + * of an extension. Portable applications are expected to invoke the extension + * API using the returned function pointers. + * + * @details The application is responsible for verifying that the given major + * version of the extension is supported by the HSA implementation (see + * ::hsa_system_major_extension_supported). If the given combination of extension + * and major version is not supported by the implementation, the behavior is + * undefined. Additionally if the length doesn't allow space for a full minor + * version, it is implementation defined if only some of the function pointers for + * that minor version get written. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] table_length Size in bytes of the function pointer table to be + * populated. The implementation will not write more than this many bytes to the + * table. 
+ * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API hsa_system_get_major_extension_table( + uint16_t extension, + uint16_t version_major, + size_t table_length, + void *table); + +/** + * @brief Struct containing an opaque handle to an agent, a device that participates in + * the HSA memory model. An agent can submit AQL packets for execution, and + * may also accept AQL packets for execution (agent dispatch packets or kernel + * dispatch packets launching HSAIL-derived binaries). + */ +typedef struct hsa_agent_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_agent_t; + +/** + * @brief Agent features. + */ +typedef enum { + /** + * The agent supports AQL packets of kernel dispatch type. If this + * feature is enabled, the agent is also a kernel agent. + */ + HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, + /** + * The agent supports AQL packets of agent dispatch type. + */ + HSA_AGENT_FEATURE_AGENT_DISPATCH = 2 +} hsa_agent_feature_t; + +/** + * @brief Hardware device type. + */ +typedef enum { + /** + * CPU device. + */ + HSA_DEVICE_TYPE_CPU = 0, + /** + * GPU device. + */ + HSA_DEVICE_TYPE_GPU = 1, + /** + * DSP device. + */ + HSA_DEVICE_TYPE_DSP = 2, + /** + * AI Engine (AIE) device. + */ + HSA_DEVICE_TYPE_AIE = 3 +} hsa_device_type_t; + +/** + * @brief Default floating-point rounding mode. 
+ */ +typedef enum { + /** + * Use a default floating-point rounding mode specified elsewhere. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, + /** + * Operations that specify the default floating-point mode are rounded to zero + * by default. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, + /** + * Operations that specify the default floating-point mode are rounded to the + * nearest representable number and that ties should be broken by selecting + * the value with an even least significant bit. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2 +} hsa_default_float_rounding_mode_t; + +/** + * @brief Agent attributes. + */ +typedef enum { + /** + * Agent name. The type of this attribute is a NUL-terminated char[64]. The + * name must be at most 63 characters long (not including the NUL terminator) + * and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_NAME = 0, + /** + * Name of vendor. The type of this attribute is a NUL-terminated char[64]. + * The name must be at most 63 characters long (not including the NUL + * terminator) and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_VENDOR_NAME = 1, + /** + * Agent capability. The type of this attribute is ::hsa_agent_feature_t. + */ + HSA_AGENT_INFO_FEATURE = 2, + /** + * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Machine model supported by the agent. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_AGENT_INFO_MACHINE_MODEL = 3, + /** + * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. 
+ * + * Profile supported by the agent. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_AGENT_INFO_PROFILE = 4, + /** + * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given + * instruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas. + * + * Default floating-point rounding mode. The type of this attribute is + * ::hsa_default_float_rounding_mode_t, but the value + * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed. + */ + HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5, + /** + * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES + * for a given instruction set architecture supported by the agent instead. If + * more than one ISA is supported by the agent, the returned value corresponds + * to the first ISA enumerated by ::hsa_agent_iterate_isas. + * + * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the + * default floating-point rounding modes supported by the agent in the Base + * profile. The type of this attribute is uint32_t. The default floating-point + * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not + * be set. + */ + HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23, + /** + * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the current agent. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is bool. 
+ */ + HSA_AGENT_INFO_FAST_F16_OPERATION = 24, + /** + * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and + * instruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront + * enumerated by ::hsa_isa_iterate_wavefronts for that ISA. + * + * Number of work-items in a wavefront. Must be a power of 2 in the range + * [1,256]. The value of this attribute is undefined if the agent is not + * a kernel agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0. No maximum can exceed the value of + * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is uint16_t[3]. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a work-group. The value of this + * attribute is undefined if the agent is not a kernel agent. The type + * of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given instruction set + * architecture supported by the agent instead. 
+ * + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined + * if the agent is not a kernel agent. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_AGENT_INFO_GRID_MAX_DIM = 9, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a grid. The value of this attribute + * is undefined if the agent is not a kernel agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_GRID_MAX_SIZE = 10, + /** + * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of fbarriers per work-group. Must be at least 32. The value + * of this attribute is undefined if the agent is not a kernel agent. The + * type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, + /** + * @deprecated The maximum number of queues is not statically determined. + * + * Maximum number of queues that can be active (created but not destroyed) at + * one time in the agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUES_MAX = 12, + /** + * Minimum number of packets that a queue created in the agent + * can hold. Must be a power of 2 greater than 0. Must not exceed + * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this + * attribute is uint32_t. 
+ */ + HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, + /** + * Maximum number of packets that a queue created in the agent can + * hold. Must be a power of 2 greater than 0. The type of this attribute + * is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, + /** + * Type of a queue created in the agent. The type of this attribute is + * ::hsa_queue_type32_t. + */ + HSA_AGENT_INFO_QUEUE_TYPE = 15, + /** + * @deprecated NUMA information is not exposed anywhere else in the API. + * + * Identifier of the NUMA node associated with the agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_NODE = 16, + /** + * Type of hardware device associated with the agent. The type of this + * attribute is ::hsa_device_type_t. + */ + HSA_AGENT_INFO_DEVICE = 17, + /** + * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about + * the caches present in a given agent. + * + * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size + * of 0 for a particular level indicates that there is no cache information + * for that level. The type of this attribute is uint32_t[4]. + */ + HSA_AGENT_INFO_CACHE_SIZE = 18, + /** + * @deprecated An agent may support multiple instruction set + * architectures. See ::hsa_agent_iterate_isas. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Instruction set architecture of the agent. The type of this attribute + * is ::hsa_isa_t. + */ + HSA_AGENT_INFO_ISA = 19, + /** + * Bit-mask indicating which extensions are supported by the agent. An + * extension with an ID of @p i is supported if the bit at position @p i is + * set. The type of this attribute is uint8_t[128]. + */ + HSA_AGENT_INFO_EXTENSIONS = 20, + /** + * Major version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. 
+ */ + HSA_AGENT_INFO_VERSION_MAJOR = 21, + /** + * Minor version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MINOR = 22, + /** + * This enum does not have a fixed underlying type, thus in C++ post D2338: + * If the enumeration type does not have a fixed underlying type, the value is + * unchanged if the original value is within the range of the enumeration + * values (9.7.1 [dcl.enum]), and otherwise, the behavior is + * undefined. + * Thus increase the range of this enum to encompass vendor extensions. + */ + HSA_AGENT_INFO_LAST = INT32_MAX +} hsa_agent_info_t; + +/** + * @brief Get the current value of an attribute for a given agent. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * agent attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_agent_get_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** + * @brief Iterate over the available agents, and invoke an + * application-defined callback on every iteration. + * + * @param[in] callback Callback to be invoked once per agent. The HSA + * runtime passes two arguments to the callback: the agent and the + * application data. 
If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_iterate_agents returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t HSA_API hsa_iterate_agents( + hsa_status_t (*callback)(hsa_agent_t agent, void* data), + void* data); + +/* + +// If we do not know the size of an attribute, we need to query it first +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_get_info_size( + hsa_agent_t agent, + hsa_agent_info_t attribute, + size_t* size); + +// Set the value of an agent's attribute +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_set_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +*/ + +/** + * @brief Exception policies applied in the presence of hardware exceptions. + */ +typedef enum { + /** + * If a hardware exception is detected, a work-item signals an exception. + */ + HSA_EXCEPTION_POLICY_BREAK = 1, + /** + * If a hardware exception is detected, a hardware status bit is set. + */ + HSA_EXCEPTION_POLICY_DETECT = 2 +} hsa_exception_policy_t; + +/** + * @deprecated Use ::hsa_isa_get_exception_policies for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, this function uses the first value returned by + * ::hsa_agent_iterate_isas. + * + * @brief Retrieve the exception policy support for a given combination of + * agent and profile. + * + * @param[in] agent Agent. + * + * @param[in] profile Profile. 
+ * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + * + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies( + hsa_agent_t agent, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Cache handle. + */ +typedef struct hsa_cache_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_cache_t; + +/** + * @brief Cache attributes. + */ +typedef enum { + /** + * The length of the cache name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is a NUL-terminated + * character array with the length equal to the value of + * ::HSA_CACHE_INFO_NAME_LENGTH attribute. + */ + HSA_CACHE_INFO_NAME = 1, + /** + * Cache level. A L1 cache must return a value of 1, a L2 must return a value + * of 2, and so on. The type of this attribute is uint8_t. + */ + HSA_CACHE_INFO_LEVEL = 2, + /** + * Cache size, in bytes. A value of 0 indicates that there is no size + * information available. The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_SIZE = 3 +} hsa_cache_info_t; + +/** + * @brief Get the current value of an attribute for a given cache object. + * + * @param[in] cache Cache. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. 
If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * cache attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_cache_get_info( + hsa_cache_t cache, + hsa_cache_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory caches of a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details Caches are visited in ascending order according to the value of the + * ::HSA_CACHE_INFO_LEVEL attribute. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per cache that is present in + * the agent. The HSA runtime passes two arguments to the callback: the cache + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. 
+ */ +hsa_status_t HSA_API hsa_agent_iterate_caches( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_cache_t cache, void* data), + void* data); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by an agent + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by an agent. All + * minor versions from 0 up to the returned @p version_minor must be supported. + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. 
The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_agent_major_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** @} */ + + +/** \defgroup signals Signals + * @{ + */ + +/** + * @brief Signal handle. + */ +typedef struct hsa_signal_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. The value 0 is reserved. + */ + uint64_t handle; +} hsa_signal_t; + +/** + * @brief Signal value. The value occupies 32 bits in small machine mode, and 64 + * bits in large machine mode. + */ +#ifdef HSA_LARGE_MODEL + typedef int64_t hsa_signal_value_t; +#else + typedef int32_t hsa_signal_value_t; +#endif + +/** + * @brief Create a signal. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. 
+ * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. + */ +hsa_status_t HSA_API hsa_signal_create( + hsa_signal_value_t initial_value, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_t *signal); + +/** + * @brief Destroy a signal previously created by ::hsa_signal_create. + * + * @param[in] signal Signal. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0. + */ +hsa_status_t HSA_API hsa_signal_destroy( + hsa_signal_t signal); + +/** + * @brief Atomically read the current value of a signal. + * + * @param[in] signal Signal. + * + * @return Value of the signal. +*/ +hsa_signal_value_t HSA_API hsa_signal_load_scacquire( + hsa_signal_t signal); + +/** + * @copydoc hsa_signal_load_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_load_relaxed( + hsa_signal_t signal); + +/** + * @deprecated Renamed as ::hsa_signal_load_scacquire. + * + * @copydoc hsa_signal_load_scacquire +*/ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire( + hsa_signal_t signal); + +/** + * @brief Atomically set the value of a signal. 
+ * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API hsa_signal_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_store_relaxed + */ +void HSA_API hsa_signal_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_store_screlease. + * + * @copydoc hsa_signal_store_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_store_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal without necessarily notifying the + * agents waiting on it. + * + * @details The agents waiting on @p signal may not wake up even when the new + * value satisfies their wait condition. If the application wants to update the + * signal and there is no need to notify any agent, invoking this function can + * be more efficient than calling the non-silent counterpart. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API hsa_signal_silent_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_silent_store_relaxed + */ +void HSA_API hsa_signal_silent_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal and return its previous value. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value New value. + * + * @return Value of the signal prior to the exchange. 
+ * + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl. + * + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacquire. + * + * @copydoc hsa_signal_exchange_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_screlease. + * + * @copydoc hsa_signal_exchange_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal if the observed value is equal to + * the expected value. The observed value is returned regardless of whether the + * replacement was done. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue + * doorbell signal, the behavior is undefined. + * + * @param[in] expected Value to compare with. + * + * @param[in] value New value. + * + * @return Observed value of the signal. 
+ * + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacq_screl. + * + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacquire. + * + * @copydoc hsa_signal_cas_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_relaxed( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_screlease( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_screlease. + * + * @copydoc hsa_signal_cas_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @brief Atomically increment the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to add to the value of the signal. 
+ * + */ +void HSA_API hsa_signal_add_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacq_screl. + * + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacquire. + * + * @copydoc hsa_signal_add_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_add_screlease. + * + * @copydoc hsa_signal_add_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically decrement the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to subtract from the value of the signal. + * + */ +void HSA_API hsa_signal_subtract_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl. 
+ * + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacquire. + * + * @copydoc hsa_signal_subtract_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_screlease. + * + * @copydoc hsa_signal_subtract_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise AND operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to AND with the value of the signal. + * + */ +void HSA_API hsa_signal_and_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacq_screl. 
+ * + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacquire. + * + * @copydoc hsa_signal_and_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_and_screlease. + * + * @copydoc hsa_signal_and_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise OR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to OR with the value of the signal. + */ +void HSA_API hsa_signal_or_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_or_scacq_screl. + * + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_scacquire. 
+ * + * @copydoc hsa_signal_or_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_screlease. + * + * @copydoc hsa_signal_or_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise XOR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to XOR with the value of the signal. + * + */ +void HSA_API hsa_signal_xor_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacq_screl. + * + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacquire. 
+ * + * @copydoc hsa_signal_xor_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_screlease. + * + * @copydoc hsa_signal_xor_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Wait condition operator. + */ +typedef enum { + /** + * The two operands are equal. + */ + HSA_SIGNAL_CONDITION_EQ = 0, + /** + * The two operands are not equal. + */ + HSA_SIGNAL_CONDITION_NE = 1, + /** + * The first operand is less than the second operand. + */ + HSA_SIGNAL_CONDITION_LT = 2, + /** + * The first operand is greater than or equal to the second operand. + */ + HSA_SIGNAL_CONDITION_GTE = 3 +} hsa_signal_condition_t; + +/** + * @brief State of the application thread during a signal wait. + */ +typedef enum { + /** + * The application thread may be rescheduled while waiting on the signal. + */ + HSA_WAIT_STATE_BLOCKED = 0, + /** + * The application thread stays active while waiting on a signal. + */ + HSA_WAIT_STATE_ACTIVE = 1 +} hsa_wait_state_t; + + +/** + * @brief Wait until a signal value satisfies a specified condition, or a + * certain amount of time has elapsed. + * + * @details A wait operation can spuriously resume at any time sooner than the + * timeout (for example, due to system or other external factors) even when the + * condition has not been met. + * + * The function is guaranteed to return if the signal value satisfies the + * condition at some point in time during the wait, but the value returned to + * the application might not satisfy the condition. 
The application must ensure + * that signals are used in such way that wait wakeup conditions are not + * invalidated before dependent threads have woken up. + * + * When the wait operation internally loads the value of the passed signal, it + * uses the memory order indicated in the function name. + * + * @param[in] signal Signal. + * + * @param[in] condition Condition used to compare the signal value with @p + * compare_value. + * + * @param[in] compare_value Value to compare with. + * + * @param[in] timeout_hint Maximum duration of the wait. Specified in the same + * unit as the system timestamp. The operation might block for a shorter or + * longer time even if the condition is not met. A value of UINT64_MAX indicates + * no maximum. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is ultimately decided by + * HSA runtime and may not match the provided hint. A value of + * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal + * update by avoiding rescheduling overhead. + * + * @return Observed value of the signal, which might not satisfy the specified + * condition. + * +*/ +hsa_signal_value_t HSA_API hsa_signal_wait_scacquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_wait_relaxed( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @deprecated Renamed as ::hsa_signal_wait_scacquire. 
+ * + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @brief Group of signals. + */ +typedef struct hsa_signal_group_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_signal_group_t; + +/** + * @brief Create a signal group. + * + * @param[in] num_signals Number of elements in @p signals. Must not be 0. + * + * @param[in] signals List of signals in the group. The list must not contain + * any repeated elements. Must not be NULL. + * + * @param[in] num_consumers Number of elements in @p consumers. Must not be 0. + * + * @param[in] consumers List of agents that might consume (wait on) the signal + * group. The list must not contain repeated elements, and must be a subset of + * the set of agents that are allowed to wait on all the signals in the + * group. If an agent not listed in @p consumers waits on the returned group, + * the behavior is undefined. The memory associated with @p consumers can be + * reused or freed after the function returns. Must not be NULL. + * + * @param[out] signal_group Pointer to newly created signal group. Must not be + * NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals + * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is + * NULL. 
+ */ +hsa_status_t HSA_API hsa_signal_group_create( + uint32_t num_signals, + const hsa_signal_t *signals, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_group_t *signal_group); + +/** + * @brief Destroy a signal group previously created by ::hsa_signal_group_create. + * + * @param[in] signal_group Signal group. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + */ +hsa_status_t HSA_API hsa_signal_group_destroy( + hsa_signal_group_t signal_group); + +/** + * @brief Wait until the value of at least one of the signals in a signal group + * satisfies its associated condition. + * + * @details The function is guaranteed to return if the value of at least one of + * the signals in the group satisfies its associated condition at some point in + * time during the wait, but the signal value returned to the application may no + * longer satisfy the condition. The application must ensure that signals in the + * group are used in such way that wait wakeup conditions are not invalidated + * before dependent threads have woken up. + * + * When this operation internally loads the value of the passed signal, it uses + * the memory order indicated in the function name. + * + * @param[in] signal_group Signal group. + * + * @param[in] conditions List of conditions. Each condition, and the value at + * the same index in @p compare_values, is used to compare the value of the + * signal at that index in @p signal_group (the signal passed by the application + * to ::hsa_signal_group_create at that particular index). The size of @p + * conditions must not be smaller than the number of signals in @p signal_group; + * any extra elements are ignored. Must not be NULL. + * + * @param[in] compare_values List of comparison values.
The size of @p + * compare_values must not be smaller than the number of signals in @p + * signal_group; any extra elements are ignored. Must not be NULL. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is decided by the HSA runtime + * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may + * improve the latency of response to a signal update by avoiding rescheduling + * overhead. + * + * @param[out] signal Signal in the group that satisfied the associated + * condition. If several signals satisfied their condition, the function can + * return any of those signals. Must not be NULL. + * + * @param[out] value Observed value for @p signal, which might no longer satisfy + * the specified condition. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p + * compare_values is NULL, @p signal is NULL, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** + * @copydoc hsa_signal_group_wait_any_scacquire + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** @} */ + +/** \defgroup memory Memory + * @{ + */ + +/** + * @brief A memory region represents a block of virtual memory with certain + * properties. 
For example, the HSA runtime represents fine-grained memory in + * the global segment using a region. A region might be associated with more + * than one agent. + */ +typedef struct hsa_region_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_region_t; + +/** @} */ + + +/** \defgroup queue Queues + * @{ + */ + +/** + * @brief Queue type. Intended to be used for dynamic queue protocol + * determination. + */ +typedef enum { + /** + * Queue supports multiple producers. Use of multiproducer queue mechanics is + * required. + */ + HSA_QUEUE_TYPE_MULTI = 0, + /** + * Queue only supports a single producer. In some scenarios, the application + * may want to limit the submission of AQL packets to a single agent. Queues + * that support a single producer may be more efficient than queues supporting + * multiple producers. Use of multiproducer queue mechanics is not supported. + */ + HSA_QUEUE_TYPE_SINGLE = 1, + /** + * Queue supports multiple producers and cooperative dispatches. Cooperative + * dispatches are able to use GWS synchronization. Queues of this type may be + * limited in number. The runtime may return the same queue to serve multiple + * ::hsa_queue_create calls when this type is given. Callers must inspect the + * returned queue to discover queue size. Queues of this type are reference + * counted and require a matching number of ::hsa_queue_destroy calls to + * release. Use of multiproducer queue mechanics is required. See + * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this + * type. + */ + HSA_QUEUE_TYPE_COOPERATIVE = 2 +} hsa_queue_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_queue_type_t constants. + */ +typedef uint32_t hsa_queue_type32_t; + +/** + * @brief Queue features. + */ +typedef enum { + /** + * Queue supports kernel dispatch packets. 
+ */ + HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, + + /** + * Queue supports agent dispatch packets. + */ + HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 +} hsa_queue_feature_t; + +/** + * @brief User mode queue. + * + * @details The queue structure is read-only and allocated by the HSA runtime, + * but agents can directly modify the contents of the buffer pointed to by @a + * base_address, or use HSA runtime APIs to access the doorbell signal. + * + */ +typedef struct hsa_queue_s { + /** + * Queue type. + */ + hsa_queue_type32_t type; + + /** + * Queue features mask. This is a bit-field of ::hsa_queue_feature_t + * values. Applications should ignore any unknown set bits. + */ + uint32_t features; + +#ifdef HSA_LARGE_MODEL + void* base_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Starting address of the HSA runtime-allocated buffer used to store the AQL + * packets. Must be aligned to the size of an AQL packet. + */ + void* base_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; +#else + uint32_t reserved0; + void* base_address; +#endif + + /** + * Signal object used by the application to indicate the ID of a packet that + * is ready to be processed. The HSA runtime manages the doorbell signal. If + * the application tries to replace or destroy this signal, the behavior is + * undefined. + * + * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be + * updated in a monotonically increasing fashion. If @a type is + * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any + * value. + */ + hsa_signal_t doorbell_signal; + + /** + * Maximum number of packets the queue can hold. Must be a power of 2. + */ + uint32_t size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Queue identifier, which is unique over the lifetime of the application. + */ + uint64_t id; + +} hsa_queue_t; + +/** + * @brief Create a user mode queue.
+ * + * @details The HSA runtime creates the queue structure, the underlying packet + * buffer, the completion signal, and the write and read indexes. The initial + * value of the write and read indexes is 0. The type of every packet in the + * buffer is initialized to ::HSA_PACKET_TYPE_INVALID. + * + * The application should only rely on the error code returned to determine if + * the queue is valid. + * + * @param[in] agent Agent where to create the queue. + * + * @param[in] size Number of packets the queue is expected to + * hold. Must be a power of 2 between 1 and the value of + * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly + * created queue is the maximum of @p size and the value of + * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. + * + * @param[in] type Type of the queue, a bitwise OR of hsa_queue_type_t values. + * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, + * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE. + * + * @param[in] callback Callback invoked by the HSA runtime for every + * asynchronous event related to the newly created queue. May be NULL. The HSA + * runtime passes three arguments to the callback: a code identifying the event + * that triggered the invocation, a pointer to the queue where the event + * originated, and the application data. + * + * @param[in] data Application data that is passed to @p callback on every + * invocation. May be NULL. + * + * @param[in] private_segment_size Hint indicating the maximum + * expected private segment usage per work-item, in bytes. There may + * be performance degradation if the application places a kernel + * dispatch packet in the queue and the corresponding private segment + * usage exceeds @p private_segment_size. If the application does not + * want to specify any particular value for this argument, @p + * private_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored.
+ * + * @param[in] group_segment_size Hint indicating the maximum expected + * group segment usage per work-group, in bytes. There may be + * performance degradation if the application places a kernel dispatch + * packet in the queue and the corresponding group segment usage + * exceeds @p group_segment_size. If the application does not want to + * specify any particular value for this argument, @p + * group_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not + * support queues of the given type. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, + * @p size is 0, @p type is an invalid queue type, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_queue_create( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue); + +/** + * @brief Create a queue for which the application or a kernel is responsible + * for processing the AQL packets. + * + * @details The application can use this function to create queues where AQL + * packets are not parsed by the packet processor associated with an agent, + * but rather by a unit of execution running on that agent (for example, a + * thread in the host application). 
+ * + * The application is responsible for ensuring that all the producers and + * consumers of the resulting queue can access the provided doorbell signal + * and memory region. The application is also responsible for ensuring that the + * unit of execution processing the queue packets supports the indicated + * features (AQL packet types). + * + * When the queue is created, the HSA runtime allocates the packet buffer using + * @p region, and the write and read indexes. The initial value of the write and + * read indexes is 0, and the type of every packet in the buffer is initialized + * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features, + * and @e doorbell_signal fields in the returned queue match the values passed + * by the application. + * + * @param[in] region Memory region that the HSA runtime should use to allocate + * the AQL packet buffer and any other queue metadata. + * + * @param[in] size Number of packets the queue is expected to hold. Must be a + * power of 2 greater than 0. + * + * @param[in] type Queue type. + * + * @param[in] features Supported queue features. This is a bit-field of + * ::hsa_queue_feature_t values. + * + * @param[in] doorbell_signal Doorbell signal that the HSA runtime must + * associate with the returned queue. The signal handle must not be 0. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. The application should not rely on the value + * returned for this argument but only on the status code to determine if the + * queue is valid. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p + * size is 0, @p type is an invalid queue type, the doorbell signal handle is + * 0, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_soft_queue_create( + hsa_region_t region, + uint32_t size, + hsa_queue_type32_t type, + uint32_t features, + hsa_signal_t doorbell_signal, + hsa_queue_t **queue); + +/** + * @brief Destroy a user mode queue. + * + * @details When a queue is destroyed, the state of the AQL packets that have + * not yet been fully processed (their completion phase has not finished) + * becomes undefined. It is the responsibility of the application to ensure that + * all pending queue operations are finished if their results are required. + * + * The resources allocated by the HSA runtime during queue creation (queue + * structure, ring buffer, doorbell signal) are released. The queue should not + * be accessed after being destroyed. + * + * @param[in] queue Pointer to a queue created using ::hsa_queue_create. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_destroy( + hsa_queue_t *queue); + +/** + * @brief Inactivate a queue. + * + * @details Inactivating the queue aborts any pending executions and prevent any + * new packets from being processed. Any more packets written to the queue once + * it is inactivated will be ignored by the packet processor. + * + * @param[in] queue Pointer to a queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_inactivate( + hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire. + * + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the read index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Read index of the queue pointed to by @p queue. + */ +uint64_t HSA_API hsa_queue_load_read_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_read_index_relaxed( + const hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire. + * + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the write index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Write index of the queue pointed to by @p queue. + */ +uint64_t HSA_API hsa_queue_load_write_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_write_index_relaxed( + const hsa_queue_t *queue); + +/** + * @brief Atomically set the write index of a queue. + * + * @details It is recommended that the application uses this function to update + * the write index when there is a single agent submitting work to the queue + * (the queue type is ::HSA_QUEUE_TYPE_SINGLE). + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the write index. + * + */ +void HSA_API hsa_queue_store_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_write_index_screlease.
+ * + * @copydoc hsa_queue_store_write_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_write_index_relaxed + */ +void HSA_API hsa_queue_store_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl. + * + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @brief Atomically set the write index of a queue if the observed value is + * equal to the expected value. The application can inspect the returned value + * to determine if the replacement was done. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] expected Expected value. + * + * @param[in] value Value to assign to the write index if @p expected matches + * the observed write index. Must be greater than @p expected. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire. + * + * @copydoc hsa_queue_cas_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease. 
+ * + * @copydoc hsa_queue_cas_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_screlease( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl. + * + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically increment the write index of a queue by an offset. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to add to the write index. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_add_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire. + * + * @copydoc hsa_queue_add_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_screlease. 
+ * + * @copydoc hsa_queue_add_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically set the read index of a queue. + * + * @details Modifications of the read index are not allowed and result in + * undefined behavior if the queue is associated with an agent for which + * only the corresponding packet processor is permitted to update the read + * index. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the read index. + * + */ +void HSA_API hsa_queue_store_read_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_read_index_screlease. + * + * @copydoc hsa_queue_store_read_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_read_index_relaxed + */ +void HSA_API hsa_queue_store_read_index_screlease( + const hsa_queue_t *queue, + uint64_t value); +/** @} */ + + +/** \defgroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Packet type. + */ +typedef enum { + /** + * Vendor-specific packet. + */ + HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0, + /** + * The packet has been processed in the past, but has not been reassigned to + * the packet processor. A packet processor must not process a packet of this + * type. All queues support this packet type. + */ + HSA_PACKET_TYPE_INVALID = 1, + /** + * Packet used by agents for dispatching jobs to kernel agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). 
+ */ + HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_AND = 3, + /** + * Packet used by agents for dispatching jobs to agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). + */ + HSA_PACKET_TYPE_AGENT_DISPATCH = 4, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_OR = 5 +} hsa_packet_type_t; + +/** + * @brief Scope of the memory fence operation associated with a packet. + */ +typedef enum { + /** + * No scope (no fence is applied). The packet relies on external fences to + * ensure visibility of memory updates. + */ + HSA_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied with agent scope for the global segment. + */ + HSA_FENCE_SCOPE_AGENT = 1, + /** + * The fence is applied across both agent and system scope for the global + * segment. + */ + HSA_FENCE_SCOPE_SYSTEM = 2 +} hsa_fence_scope_t; + +/** + * @brief Sub-fields of the @a header field that is present in any AQL + * packet. The offset (with respect to the address of @a header) of a sub-field + * is identical to its enumeration constant. The width of each sub-field is + * determined by the corresponding value in ::hsa_packet_header_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Packet type. The value of this sub-field must be one of + * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the + * packet layout is vendor-specific. + */ + HSA_PACKET_HEADER_TYPE = 0, + /** + * Barrier bit. If the barrier bit is set, the processing of the current + * packet only launches when all preceding packets (within the same queue) are + * complete. 
+ */ + HSA_PACKET_HEADER_BARRIER = 8, + /** + * Acquire fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied before the packet enters the + * active phase. An acquire fence ensures that any subsequent global segment + * or image loads by any unit of execution that belongs to a dispatch that has + * not yet entered the active phase on any queue of the same kernel agent, + * sees any data previously released at the scopes specified by the acquire + * fence. The value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9, + /** + * Release fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied after kernel completion but + * before the packet is completed. A release fence makes any global segment or + * image data that was stored by any unit of execution that belonged to a + * dispatch that has completed the active phase on any queue of the same + * kernel agent visible in all the scopes specified by the release fence. The + * value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11 + } hsa_packet_header_t; + +/** + * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t. + */ + typedef enum { + HSA_PACKET_HEADER_WIDTH_TYPE = 8, + HSA_PACKET_HEADER_WIDTH_BARRIER = 1, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE.
+ */ + HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2 + } hsa_packet_header_width_t; + +/** + * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset + * (with respect to the address of @a setup) of a sub-field is identical to its + * enumeration constant. The width of each sub-field is determined by the + * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Number of dimensions of the grid. Valid values are 1, 2, or 3. + * + */ + HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 + } hsa_kernel_dispatch_packet_setup_t; + +/** + * @brief Width (in bits) of the sub-fields in + * ::hsa_kernel_dispatch_packet_setup_t. + */ + typedef enum { + HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 + } hsa_kernel_dispatch_packet_setup_width_t; + +/** + * @brief AQL kernel dispatch packet + */ +typedef struct hsa_kernel_dispatch_packet_s { + union { + struct { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Dispatch setup parameters. Used to configure kernel dispatch parameters + * such as the number of dimensions in the grid. The parameters are described + * by ::hsa_kernel_dispatch_packet_setup_t. + */ + uint16_t setup; + }; + uint32_t full_header; + }; + + /** + * X dimension of work-group, in work-items. Must be greater than 0. + */ + uint16_t workgroup_size_x; + + /** + * Y dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 dimension, the only valid value is 1. + */ + uint16_t workgroup_size_y; + + /** + * Z dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 or 2 dimensions, the only valid value is 1. + */ + uint16_t workgroup_size_z; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * X dimension of grid, in work-items. 
Must be greater than 0. Must + * not be smaller than @a workgroup_size_x. + */ + uint32_t grid_size_x; + + /** + * Y dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 dimension, the only valid value is 1. Must not be smaller than @a + * workgroup_size_y. + */ + uint32_t grid_size_y; + + /** + * Z dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a + * workgroup_size_z. + */ + uint32_t grid_size_z; + + /** + * Size in bytes of private memory allocation request (per work-item). + */ + uint32_t private_segment_size; + + /** + * Size in bytes of group memory allocation request (per work-group). Must not + * be less than the sum of the group memory used by the kernel (and the + * functions it calls directly or indirectly) and the dynamically allocated + * group segment variables. + */ + uint32_t group_segment_size; + + /** + * Opaque handle to a code object that includes an implementation-defined + * executable code for the kernel. + */ + uint64_t kernel_object; + +#ifdef HSA_LARGE_MODEL + void* kernarg_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Pointer to a buffer containing the kernel arguments. May be NULL. + * + * The buffer must be allocated using ::hsa_memory_allocate, and must not be + * modified once the kernel dispatch packet is enqueued until the dispatch has + * completed execution. + */ + void* kernarg_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* kernarg_address; +#endif + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_kernel_dispatch_packet_t; + +/** + * @brief Agent dispatch packet. 
+ */ +typedef struct hsa_agent_dispatch_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Application-defined function to be performed by the destination agent. + */ + uint16_t type; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + +#ifdef HSA_LARGE_MODEL + void* return_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Address where to store the function return values, if any. + */ + void* return_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* return_address; +#endif + + /** + * Function arguments. + */ + uint64_t arg[4]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_agent_dispatch_packet_t; + +/** + * @brief Barrier-AND packet. + */ +typedef struct hsa_barrier_and_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as satisfied + * dependencies. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_and_packet_t; + +/** + * @brief Barrier-OR packet. 
+ */ +typedef struct hsa_barrier_or_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as dependencies not + * satisfied. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_or_packet_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a region. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_REGION_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_REGION_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_REGION_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_REGION_SEGMENT_GROUP = 3, + /** + * Kernarg segment. Used to store kernel arguments. + */ + HSA_REGION_SEGMENT_KERNARG = 4 +} hsa_region_segment_t; + +/** + * @brief Global region flags. + */ +typedef enum { + /** + * The application can use memory in the region to store kernel arguments, and + * provide the values for the kernarg segment of a kernel dispatch. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set. 
+ */ + HSA_REGION_GLOBAL_FLAG_KERNARG = 1, + /** + * Updates to memory in this region are immediately visible to all the + * agents under the terms of the HSA memory model. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set. + */ + HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Updates to memory in this region can be performed by a single agent at + * a time. If a different agent in the system is allowed to access the + * region, the application must explicitly invoke ::hsa_memory_assign_agent + * in order to transfer ownership to that agent for a particular buffer. + */ + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** + * Updates to memory in this region have extended scope, where the device-scope atomics + * to this memory type act as system-scope with respect to all variables located in + * memory regions of this type. + * Note: On non-compliant systems, the application may still be responsible for performing + * device-specific actions necessary to achieve system-scope coherence. + */ + HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8 +} hsa_region_global_flag_t; + +/** + * @brief Attributes of a memory region. + */ + +#ifdef __cplusplus +typedef enum : int { +#else +typedef enum { +#endif + /** + * Segment where memory in the region can be used. The type of this + * attribute is ::hsa_region_segment_t. + */ + HSA_REGION_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of + * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t + * values. + */ + HSA_REGION_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this region, in bytes. The type of this attribute is size_t. + */ + HSA_REGION_INFO_SIZE = 2, + /** + * Maximum allocation size in this region, in bytes. Must not exceed the value + * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
+ * + * If the region is in the global or readonly segments, this is the maximum + * size that the application can pass to ::hsa_memory_allocate. + * + * If the region is in the group segment, this is the maximum size (per + * work-group) that can be requested for a given kernel dispatch. If the + * region is in the private segment, this is the maximum size (per work-item) + * that can be requested for a specific kernel dispatch, and must be at least + * 256 bytes. + */ + HSA_REGION_INFO_ALLOC_MAX_SIZE = 4, + /** + * Maximum size (per work-group) of private memory that can be requested for a + * specific kernel dispatch. Must be at least 65536 bytes. The type of this + * attribute is uint32_t. The value of this attribute is undefined if the + * region is not in the private segment. + */ + HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8, + /** + * Indicates whether memory in this region can be allocated using + * ::hsa_memory_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for regions in the group and private + * segments. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by ::hsa_memory_allocate in + * this region. The size of a buffer allocated in this region is a multiple of + * the value of this attribute. The value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type + * of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The + * value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be + * a power of 2. The type of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7 +} hsa_region_info_t; + +/** + * @brief Get the current value of an attribute of a region. + * + * @param[in] region A valid region. 
+ * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * region attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_region_get_info( + hsa_region_t region, + hsa_region_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory regions associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per region that is + * accessible from the agent. The HSA runtime passes two arguments to the + * callback, the region and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and ::hsa_agent_iterate_regions returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data); + +/** + * @brief Allocate a block of memory in a given region.
+ * + * @param[in] region Region where to allocate memory from. The region must have + * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE + * in @p region. + * + * @param[out] ptr Pointer to the location where to store the base address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation + * fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p region, or @p size is greater than the value of + * HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + */ +hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region, + size_t size, + void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_memory_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_memory_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_memory_free(void* ptr); + +/** + * @brief Copy a block of memory from the location pointed to by @p src to the + * memory block pointed to by @p dst. 
+ * + * @param[out] dst Buffer where the content is to be copied. If @p dst is in + * coarse-grained memory, the copied data is only visible to the agent currently + * assigned (::hsa_memory_assign_agent) to @p dst. + * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer. If the source buffer is + * in coarse-grained memory then it must be assigned to an agent, from which the + * data will be retrieved. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL. + */ +hsa_status_t HSA_API hsa_memory_copy( + void *dst, + const void *src, + size_t size); + +/** + * @brief Change the ownership of a global, coarse-grained buffer. + * + * @details The contents of a coarse-grained buffer are visible to an agent + * only after ownership has been explicitly transferred to that agent. Once the + * operation completes, the previous owner can no longer access the data in the + * buffer. + * + * An implementation of the HSA runtime is allowed, but not required, to change + * the physical location of the buffer when ownership is transferred to a + * different agent. In general the application must not assume this + * behavior. The virtual location (address) of the passed buffer is never + * modified. + * + * @param[in] ptr Base address of a global buffer. The pointer must match an + * address previously returned by ::hsa_memory_allocate.
The size of the buffer + * affected by the ownership change is identical to the size of that previous + * allocation. If @p ptr points to a fine-grained global buffer, no operation is + * performed and the function returns success. If @p ptr does not point to + * global memory, the behavior is undefined. + * + * @param[in] agent Agent that becomes the owner of the buffer. The + * application is responsible for ensuring that @p agent has access to the + * region that contains the buffer. It is allowed to change ownership to an + * agent that is already the owner of the buffer, with the same or different + * access permissions. + * + * @param[in] access Access permissions requested for the new owner. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is + * not a valid access value. + */ +hsa_status_t HSA_API hsa_memory_assign_agent( + void *ptr, + hsa_agent_t agent, + hsa_access_permission_t access); + +/** + * + * @brief Register a global, fine-grained buffer. + * + * @details Registering a buffer serves as an indication to the HSA runtime that + * the memory might be accessed from a kernel agent other than the + * host. Registration is a performance hint that allows the HSA runtime + * implementation to know which buffers will be accessed by some of the kernel + * agents ahead of time. + * + * Registration is only recommended for buffers in the global segment that have + * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS + * allocator instead. Registering an OS-allocated buffer in the base profile is + * equivalent to a no-op. 
+ * + * Registrations should not overlap. + * + * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is + * passed, no operation is performed. If the buffer has been allocated using + * ::hsa_memory_allocate, or has already been registered, no operation is + * performed. + * + * @param[in] size Requested registration size in bytes. A size of 0 is + * only allowed if @p ptr is NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr + * is not NULL. + */ +hsa_status_t HSA_API hsa_memory_register( + void *ptr, + size_t size); + +/** + * + * @brief Deregister memory previously registered using ::hsa_memory_register. + * + * @details If the memory interval being deregistered does not match a previous + * registration (start and end addresses), the behavior is undefined. + * + * @param[in] ptr A pointer to the base of the buffer to be deregistered. If + * a NULL pointer is passed, no operation is performed. + * + * @param[in] size Size of the buffer to be deregistered. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_memory_deregister( + void *ptr, + size_t size); + +/** @} */ + + +/** \defgroup instruction-set-architecture Instruction Set Architecture. + * @{ + */ + +/** + * @brief Instruction set architecture. + */ +typedef struct hsa_isa_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. 
+ */ + uint64_t handle; +} hsa_isa_t; + +/** + * @brief Retrieve a reference to an instruction set architecture handle out of + * a symbolic name. + * + * @param[in] name Vendor-specific name associated with a particular + * instruction set architecture. @p name must start with the vendor name and a + * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be + * a NUL-terminated string. + * + * @param[out] isa Memory location where the HSA runtime stores the ISA handle + * corresponding to the given name. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not + * correspond to any instruction set architecture. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_from_name( + const char *name, + hsa_isa_t *isa); + +/** + * @brief Iterate over the instruction sets supported by the given agent, and + * invoke an application-defined callback on every iteration. The iterator is + * deterministic: if an agent supports several instruction set architectures, + * they are traversed in the same order in every invocation of this function. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per instruction set + * architecture. The HSA runtime passes two arguments to the callback: the + * ISA and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that status value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL.
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_isas( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_isa_t isa, void *data), + void *data); + +/** + * @brief Instruction set architecture attributes. + */ +typedef enum { + /** + * The length of the ISA name in bytes, not including the NUL terminator. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is character array + * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute. + */ + HSA_ISA_INFO_NAME = 1, + /** + * @deprecated + * + * Number of call conventions supported by the instruction set architecture. + * Must be greater than zero. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2, + /** + * @deprecated + * + * Number of work-items in a wavefront for a given call convention. Must be a + * power of 2 in the range [1,256]. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3, + /** + * @deprecated + * + * Number of wavefronts per compute unit for a given call convention. In + * practice, other factors (for example, the amount of group memory used by a + * work-group) may further limit the number of wavefronts per compute + * unit. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4, + /** + * Machine models supported by the instruction set architecture. The type of + * this attribute is a bool[2]. If the ISA supports the small machine model, + * the element at index ::HSA_MACHINE_MODEL_SMALL is true. 
If the ISA supports + * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true. + */ + HSA_ISA_INFO_MACHINE_MODELS = 5, + /** + * Profiles supported by the instruction set architecture. The type of this + * attribute is a bool[2]. If the ISA supports the base profile, the element + * at index ::HSA_PROFILE_BASE is true. If the ISA supports the full profile, + * the element at index ::HSA_PROFILE_FULL is true. + */ + HSA_ISA_INFO_PROFILES = 6, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture. The type of this attribute is a bool[3]. The value at a given + * index is true if the corresponding rounding mode in + * ::hsa_default_float_rounding_mode_t is supported. At least one default mode + * has to be supported. + * + * If the default mode is supported, then + * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that + * both the zero and the near rounding modes are supported. + */ + HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture in the Base profile. The type of this attribute is a + * bool[3]. The value at a given index is true if the corresponding rounding + * mode in ::hsa_default_float_rounding_mode_t is supported. The value at + * index HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false. At least one + * of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or + * HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true. + */ + HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8, + /** + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the instruction set architecture. The type of this + * attribute is bool. + */ + HSA_ISA_INFO_FAST_F16_OPERATION = 9, + /** + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0.
No maximum can exceed the value of + * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is + * uint16_t[3]. + */ + HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12, + /** + * Maximum total number of work-items in a work-group. The type + * of this attribute is uint32_t. + */ + HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13, + /** + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_ISA_INFO_GRID_MAX_DIM = 14, + /** + * Maximum total number of work-items in a grid. The type of this + * attribute is uint64_t. + */ + HSA_ISA_INFO_GRID_MAX_SIZE = 16, + /** + * Maximum number of fbarriers per work-group. Must be at least 32. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17 +} hsa_isa_info_t; + +/** + * @deprecated The concept of call convention has been deprecated. If the + * application wants to query the value of an attribute for a given instruction + * set architecture, use ::hsa_isa_get_info_alt instead. If the application + * wants to query an attribute that is specific to a given combination of ISA + * and wavefront, use ::hsa_wavefront_get_info. + * + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[in] index Call convention index. Used only for call convention + * attributes, otherwise ignored. Must have a value between 0 (inclusive) and + * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not + * inclusive) in @p isa. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. 
If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info( + hsa_isa_t isa, + hsa_isa_info_t attribute, + uint32_t index, + void *value); + +/** + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_get_info_alt( + hsa_isa_t isa, + hsa_isa_info_t attribute, + void *value); + +/** + * @brief Retrieve the exception policy support for a given combination of + * instruction set architecture and profile. + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] profile Profile. 
+ * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_exception_policies( + hsa_isa_t isa, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Floating-point types. + */ +typedef enum { + /** + * 16-bit floating-point type. + */ + HSA_FP_TYPE_16 = 1, + /** + * 32-bit floating-point type. + */ + HSA_FP_TYPE_32 = 2, + /** + * 64-bit floating-point type. + */ + HSA_FP_TYPE_64 = 4 +} hsa_fp_type_t; + +/** + * @brief Flush to zero modes. + */ +typedef enum { + /** + * Flush to zero. + */ + HSA_FLUSH_MODE_FTZ = 1, + /** + * Do not flush to zero. + */ + HSA_FLUSH_MODE_NON_FTZ = 2 +} hsa_flush_mode_t; + +/** + * @brief Round methods. + */ +typedef enum { + /** + * Single round method. + */ + HSA_ROUND_METHOD_SINGLE = 1, + /** + * Double round method. + */ + HSA_ROUND_METHOD_DOUBLE = 2 +} hsa_round_method_t; + +/** + * @brief Retrieve the round method (single or double) used to implement the + * floating-point multiply add instruction (mad) for a given combination of + * instruction set architecture, floating-point type, and flush to zero + * modifier. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] fp_type Floating-point type. + * + * @param[in] flush_mode Flush to zero modifier. + * + * @param[out] round_method Pointer to a memory location where the HSA + * runtime stores the round method used by the implementation. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid + * floating-point type, or @p flush_mode is not a valid flush to zero modifier, + * or @p round_method is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_round_method( + hsa_isa_t isa, + hsa_fp_type_t fp_type, + hsa_flush_mode_t flush_mode, + hsa_round_method_t *round_method); + +/** + * @brief Wavefront handle + */ +typedef struct hsa_wavefront_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_wavefront_t; + +/** + * @brief Wavefront attributes. + */ +typedef enum { + /** + * Number of work-items in the wavefront. Must be a power of 2 in the range + * [1,256]. The type of this attribute is uint32_t. + */ + HSA_WAVEFRONT_INFO_SIZE = 0 +} hsa_wavefront_info_t; + +/** + * @brief Get the current value of a wavefront attribute. + * + * @param[in] wavefront A wavefront. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * wavefront attribute, or @p value is NULL. 
+ */ +hsa_status_t HSA_API hsa_wavefront_get_info( + hsa_wavefront_t wavefront, + hsa_wavefront_info_t attribute, + void *value); + +/** + * @brief Iterate over the different wavefronts supported by an instruction set + * architecture, and invoke an application-defined callback on every iteration. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] callback Callback to be invoked once per wavefront that is + * supported by the agent. The HSA runtime passes two arguments to the callback: + * the wavefront handle and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_isa_iterate_wavefronts( + hsa_isa_t isa, + hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data), + void *data); + +/** + * @deprecated Use ::hsa_agent_iterate_isas to query which instruction set + * architectures are supported by a given agent. + * + * @brief Check if the instruction set architecture of a code object can be + * executed on an agent associated with another architecture. + * + * @param[in] code_object_isa Instruction set architecture associated with a + * code object. + * + * @param[in] agent_isa Instruction set architecture associated with an agent. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check.
If the two architectures are compatible, the result + * is true; if they are incompatible, the result is false. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible( + hsa_isa_t code_object_isa, + hsa_isa_t agent_isa, + bool *result); + +/** @} */ + + +/** \defgroup executable Executable + * @{ + */ + +/** + * @brief Code object reader handle. A code object reader is used to + * load a code object from file (when created using + * ::hsa_code_object_reader_create_from_file), or from memory (if created using + * ::hsa_code_object_reader_create_from_memory). + */ +typedef struct hsa_code_object_reader_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_object_reader_t; + +/** + * @brief Create a code object reader to operate on a file. + * + * @param[in] file File descriptor. The file must have been opened by the + * application with at least read permissions prior to calling this function. + * The file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_file( + hsa_file_t file, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Create a code object reader to operate on memory. + * + * @param[in] code_object Memory buffer that contains a vendor-specific code + * object. The buffer is owned and managed by the application; the lifetime of + * the buffer must exceed that of any associated code object reader. + * + * @param[in] size Size of the buffer pointed to by @p code_object. Must not be + * 0. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size + * is zero, or @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_memory( + const void *code_object, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Destroy a code object reader. + * + * @details The code object reader handle becomes invalid after completion of + * this function. Any file or memory used to create the code object reader is + * not closed, removed, or deallocated by this function. + * + * @param[in] code_object_reader Code object reader to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + */ +hsa_status_t HSA_API hsa_code_object_reader_destroy( + hsa_code_object_reader_t code_object_reader); + +/** + * @brief Struct containing an opaque handle to an executable, which contains + * ISA for finalized kernels and indirect functions together with the allocated + * global or readonly segment variables they reference. + */ +typedef struct hsa_executable_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_t; + +/** + * @brief Executable state. + */ +typedef enum { + /** + * Executable state, which allows the user to load code objects and define + * external variables. Variable addresses, kernel code handles, and + * indirect function code handles are not available in query operations until + * the executable is frozen (zero always returned). + */ + HSA_EXECUTABLE_STATE_UNFROZEN = 0, + /** + * Executable state, which allows the user to query variable addresses, + * kernel code handles, and indirect function code handles using query + * operations. Loading new code objects, as well as defining external + * variables, is not allowed in this state. + */ + HSA_EXECUTABLE_STATE_FROZEN = 1 +} hsa_executable_state_t; + +/** + * @deprecated Use ::hsa_executable_create_alt instead, which allows the + * application to specify the default floating-point rounding mode of the + * executable and assumes an unfrozen initial state. + * + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] executable_state Executable state. If the state is + * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no + * code objects can be loaded, and no variables can be defined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. 
A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores the newly + * created executable handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create( + hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] default_float_rounding_mode Default floating-point rounding mode + * used in the executable. Allowed rounding modes are near and zero (default is + * not allowed). + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores the + * newly created executable handle. The initial state of the executable is + * ::HSA_EXECUTABLE_STATE_UNFROZEN. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API hsa_executable_create_alt( + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Destroy an executable. + * + * @details An executable handle becomes invalid after the executable has been + * destroyed. Code object handles that were loaded into this executable are + * still valid after the executable has been destroyed, and can be used as + * intended. Resources allocated outside and associated with this executable + * (such as external global or readonly variables) can be released after the + * executable has been destroyed. + * + * Executable should not be destroyed while kernels are in flight. + * + * @param[in] executable Executable. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + */ +hsa_status_t HSA_API hsa_executable_destroy( + hsa_executable_t executable); + +/** + * @brief Loaded code object handle. + */ +typedef struct hsa_loaded_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_loaded_code_object_t; + +/** + * @brief Load a program code object into an executable. + * + * @details A program code object contains information about resources that are + * accessible by all kernel agents that run the executable, and can be loaded + * at most once into an executable. 
+ * + * If the program code object uses extensions, the implementation must support + * them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] code_object_reader A code object reader that holds the program + * code object to load. If a code object reader is destroyed before all the + * associated executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is + * not compatible with the executable or the implementation (for example, the + * code object uses an extension that is not supported by the implementation). + */ +hsa_status_t HSA_API hsa_executable_load_program_code_object( + hsa_executable_t executable, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Load an agent code object into an executable.
+ * + * @details The agent code object contains all defined agent + * allocation variables, functions, indirect functions, and kernels in a given + * program for a given instruction set architecture. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * The default floating-point rounding mode of the code object associated with + * @p code_object_reader must match that of the executable + * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which + * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used). + * If the agent code object uses extensions, the implementation and the agent + * must support them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent to load code object for. A code object can be loaded + * into an executable at most once for a given agent. The instruction set + * architecture of the code object must be supported by the agent. + * + * @param[in] code_object_reader A code object reader that holds the code object + * to load. If a code object reader is destroyed before all the associated + * executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p + * code_object_reader is not compatible with the agent (for example, the agent + * does not support the instruction set architecture of the code object), the + * executable (for example, there is a default floating-point mode mismatch + * between the two), or the implementation. + */ +hsa_status_t HSA_API hsa_executable_load_agent_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Freeze the executable. + * + * @details No modifications to executable can be made after freezing: no code + * objects can be loaded to the executable, and no external variables can be + * defined. Freezing the executable does not prevent querying the executable's + * attributes. The application must define all the external variables in an + * executable before freezing it. + * + * @param[in] executable Executable. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are + * undefined in the executable. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen. + */ +hsa_status_t HSA_API hsa_executable_freeze( + hsa_executable_t executable, + const char *options); + +/** + * @brief Executable attributes. + */ +typedef enum { + /** + * Profile this executable is created for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_EXECUTABLE_INFO_PROFILE = 1, + /** + * Executable state. The type of this attribute is ::hsa_executable_state_t. + */ + HSA_EXECUTABLE_INFO_STATE = 2, + /** + * Default floating-point rounding mode specified when executable was created. + * The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3 +} hsa_executable_info_t; + +/** + * @brief Get the current value of an attribute for a given executable. + * + * @param[in] executable Executable. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable attribute, or @p value is NULL. 
+ */ +hsa_status_t HSA_API hsa_executable_get_info( + hsa_executable_t executable, + hsa_executable_info_t attribute, + void *value); + +/** + * @brief Define an external global variable with program allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with program allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * be in global memory and can be read and written by any agent in the + * system. The application cannot deallocate the buffer pointed by @p address + * before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_global_variable_define( + hsa_executable_t executable, + const char *variable_name, + void *address); + +/** + * @brief Define an external global variable with agent allocation. 
+ * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with agent allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a global region + * that is only visible to @p agent. The application cannot deallocate the + * buffer pointed by @p address before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_agent_global_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Define an external readonly variable. 
+ * + * @details This function allows the application to provide the definition + * of a variable in the readonly segment memory. The variable must be defined + * before loading a code object into an executable. In addition, code objects + * loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a readonly + * region associated with @p agent. The application cannot deallocate the buffer + * pointed by @p address before @p executable is destroyed. + * + * @param[in] address Address where the variable is defined. The buffer pointed + * by @p address is owned by the application, and cannot be deallocated before + * @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE Executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. 
+ */ +hsa_status_t HSA_API hsa_executable_readonly_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). Invoking this function is equivalent to invoking + * ::hsa_executable_validate_alt with no options. + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate( + hsa_executable_t executable, + uint32_t *result); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL.
+ * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate_alt( + hsa_executable_t executable, + const char *options, + uint32_t *result); + +/** + * @brief Executable symbol handle. + * + * The lifetime of an executable object symbol matches that of the executable + * associated with it. An operation on a symbol whose associated executable has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_executable_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_symbol_t; + +/** + * @deprecated Use ::hsa_executable_get_symbol_by_name instead. + * + * @brief Get the symbol handle for a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[in] agent Agent associated with the symbol. If the symbol is + * independent of any agent (for example, a variable with program + * allocation), this argument is ignored. + * + * @param[in] call_convention Call convention associated with the symbol. If the + * symbol does not correspond to an indirect function, this argument is ignored. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol( + hsa_executable_t executable, + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention, + hsa_executable_symbol_t *symbol); + +/** + * @brief Retrieve the symbol handle corresponding to a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] symbol_name Symbol name. Must be a NUL-terminated character + * array. The Programmer's Reference Manual describes the standard name mangling + * scheme. + * + * @param[in] agent Pointer to the agent for which the symbol with the given + * name is defined. If the symbol corresponding to the given name has program + * allocation, @p agent must be NULL. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p + * symbol is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_symbol_by_name( + hsa_executable_t executable, + const char *symbol_name, + const hsa_agent_t *agent, + hsa_executable_symbol_t *symbol); + +/** + * @brief Symbol type. + */ +typedef enum { + /** + * Variable. 
+ */ + HSA_SYMBOL_KIND_VARIABLE = 0, + /** + * Kernel. + */ + HSA_SYMBOL_KIND_KERNEL = 1, + /** + * Indirect function. + */ + HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2 +} hsa_symbol_kind_t; + +/** + * @brief Linkage type of a symbol. + */ +typedef enum { + /** + * Module linkage. + */ + HSA_SYMBOL_LINKAGE_MODULE = 0, + /** + * Program linkage. + */ + HSA_SYMBOL_LINKAGE_PROGRAM = 1 +} hsa_symbol_linkage_t; + +/** + * @brief Allocation type of a variable. + */ +typedef enum { + /** + * Agent allocation. + */ + HSA_VARIABLE_ALLOCATION_AGENT = 0, + /** + * Program allocation. + */ + HSA_VARIABLE_ALLOCATION_PROGRAM = 1 +} hsa_variable_allocation_t; + +/** + * @brief Memory segment associated with a variable. + */ +typedef enum { + /** + * Global memory segment. + */ + HSA_VARIABLE_SEGMENT_GLOBAL = 0, + /** + * Readonly memory segment. + */ + HSA_VARIABLE_SEGMENT_READONLY = 1 +} hsa_variable_segment_t; + +/** + * @brief Executable symbol attributes. + */ +typedef enum { + /** + * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, + /** + * @deprecated + * + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * @deprecated + * + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. 
The type of this attribute + * is character array with the length equal to the value of + * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * @deprecated + * + * Agent associated with this symbol. If the symbol is a variable, the + * value of this attribute is only defined if + * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is + * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20, + /** + * The address of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint64_t. + * + * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is + * returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * @deprecated + * + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * @deprecated + * + * The segment kind of the variable. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * @deprecated + * + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * @deprecated + * + * Size of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint32_t. + * + * A value of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * @deprecated + * + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Kernel object handle, used in the kernel dispatch packet. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint64_t. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. 
+ * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is + * true, the kernel may use more private memory than the reported value, and + * the application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * @deprecated + * + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Indirect function object handle. The value of this attribute is undefined + * if the symbol is not an indirect function, or the associated agent does + * not support the Full Profile. The type of this attribute depends on the + * machine model: the type is uint32_t for small machine model, and uint64_t + * for large model. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23, + /** + * @deprecated + * + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function, or the associated + * agent does not support the Full Profile. The type of this attribute is + * uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_executable_symbol_info_t; + +/** + * @brief Get the current value of an attribute for a given executable symbol. + * + * @param[in] executable_symbol Executable symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_symbol_get_info( + hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in an executable, and invoke an + * application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. 
If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the kernels, indirect functions, and agent allocation + * variables in an executable for a given agent, and invoke an application- + * defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes four arguments to the callback: the executable, the + * agent, a symbol, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_executable_iterate_agent_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. 
+ */ +hsa_status_t HSA_API hsa_executable_iterate_agent_symbols( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the program allocation variables in an executable, and + * invoke an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_program_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_executable_iterate_program_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** @} */ + + +/** \defgroup code-object Code Objects (deprecated). + * @{ + */ + +/** + * @deprecated + * + * @brief Struct containing an opaque handle to a code object, which contains + * ISA for finalized kernels and indirect functions together with information + * about the global or readonly segment variables they reference. + */ +typedef struct hsa_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. 
+ */ + uint64_t handle; +} hsa_code_object_t; + +/** + * @deprecated + * + * @brief Application data handle that is passed to the serialization + * and deserialization functions. + */ +typedef struct hsa_callback_data_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_callback_data_t; + +/** + * @deprecated + * + * @brief Serialize a code object. Can be used for offline finalization, + * install-time finalization, disk code caching, etc. + * + * @param[in] code_object Code object. + * + * @param[in] alloc_callback Callback function for memory allocation. Must not + * be NULL. The HSA runtime passes three arguments to the callback: the + * allocation size, the application data, and a pointer to a memory location + * where the application stores the allocation result. The HSA runtime invokes + * @p alloc_callback once to allocate a buffer that contains the serialized + * version of @p code_object. If the callback returns a status code other than + * ::HSA_STATUS_SUCCESS, this function returns the same code. + * + * @param[in] callback_data Application data that is passed to @p + * alloc_callback. May be NULL. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] serialized_code_object Memory location where the HSA runtime + * stores a pointer to the serialized code object. Must not be NULL. + * + * @param[out] serialized_code_object_size Memory location where the HSA runtime + * stores the size (in bytes) of @p serialized_code_object. The returned value + * matches the allocation size passed by the HSA runtime to @p + * alloc_callback. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p + * serialized_code_object, or @p serialized_code_object_size are NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)(size_t size, + hsa_callback_data_t data, + void **address), + hsa_callback_data_t callback_data, + const char *options, + void **serialized_code_object, + size_t *serialized_code_object_size); + +/** + * @deprecated + * + * @brief Deserialize a code object. + * + * @param[in] serialized_code_object A serialized code object. Must not be NULL. + * + * @param[in] serialized_code_object_size The size (in bytes) of @p + * serialized_code_object. Must not be 0. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] code_object Memory location where the HSA runtime stores the + * deserialized code object. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p + * code_object are NULL, or @p serialized_code_object_size is 0. 
+ */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize( + void *serialized_code_object, + size_t serialized_code_object_size, + const char *options, + hsa_code_object_t *code_object); + +/** + * @deprecated + * + * @brief Destroy a code object. + * + * @details The lifetime of a code object must exceed that of any executable + * where it has been loaded. If an executable that loaded @p code_object has not + * been destroyed, the behavior is undefined. + * + * @param[in] code_object Code object. The handle becomes invalid after it has + * been destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy( + hsa_code_object_t code_object); + +/** + * @deprecated + * + * @brief Code object type. + */ +typedef enum { + /** + * Produces code object that contains ISA for all kernels and indirect + * functions in HSA source. + */ + HSA_CODE_OBJECT_TYPE_PROGRAM = 0 +} hsa_code_object_type_t; + +/** + * @deprecated + * + * @brief Code object attributes. + */ +typedef enum { + /** + * The version of the code object. The type of this attribute is a + * NUL-terminated char[64]. The name must be at most 63 characters long (not + * including the NUL terminator) and all array elements not used for the name + * must be NUL. + */ + HSA_CODE_OBJECT_INFO_VERSION = 0, + /** + * Type of code object. The type of this attribute is + * ::hsa_code_object_type_t. + */ + HSA_CODE_OBJECT_INFO_TYPE = 1, + /** + * Instruction set architecture this code object is produced for. The type of + * this attribute is ::hsa_isa_t. + */ + HSA_CODE_OBJECT_INFO_ISA = 2, + /** + * Machine model this code object is produced for. The type of this attribute + * is ::hsa_machine_model_t. 
+ */ + HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3, + /** + * Profile this code object is produced for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_CODE_OBJECT_INFO_PROFILE = 4, + /** + * Default floating-point rounding mode used when the code object is + * produced. The type of this attribute is + * ::hsa_default_float_rounding_mode_t. + */ + HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5 +} hsa_code_object_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code object. + * + * @param[in] code_object Code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code object attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info( + hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Load code object into the executable. + * + * @details Every global or readonly variable that is external must be defined + * before loading the code object. An internal global or readonly variable is + * allocated once the code object, that is being loaded, references this + * variable and this variable is not allocated. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * @param[in] executable Executable. 
+ * + * @param[in] agent Agent to load code object for. The agent must support the + * default floating-point rounding mode used by @p code_object. + * + * @param[in] code_object Code object to load. The lifetime of the code object + * must exceed that of the executable: if @p code_object is destroyed before @p + * executable, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible + * with @p code_object (for example, @p agent does not support the default + * floating-point rounding mode specified by @p code_object), or @p code_object + * is not compatible with @p executable (for example, @p code_object and @p + * executable have different machine models or profiles). + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options); + +/** + * @deprecated + * + * @brief Code object symbol handle. 
+ * + * The lifetime of a code object symbol matches that of the code object + * associated with it. An operation on a symbol whose associated code object has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_code_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_symbol_t; + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol( + hsa_code_object_t code_object, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name( + hsa_code_object_t code_object, + const char *module_name, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Code object symbol attributes. + */ +typedef enum { + /** + * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_CODE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_CODE_SYMBOL_INFO_NAME = 2, + /** + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. The type of this attribute + * is character array with the length equal to the value of + * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_CODE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * The allocation kind of the variable. 
The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * The segment kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * Size of the variable. The value of this attribute is undefined if the + * symbol is not a variable. The type of this attribute is uint32_t. + * + * A size of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. 
+ */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true, + * the kernel may use more private memory than the reported value, and the + * application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function. The type of this + * attribute is uint32_t. 
+ */ + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16, + /** + * Wavefront size used by the kernel. The value of this attribute is either + * 32 or 64. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 +} hsa_code_symbol_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code symbol. + * + * @param[in] code_symbol Code symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info( + hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in a code object, and invoke an + * application-defined callback on every iteration. + * + * @param[in] code_object Code object. + * + * @param[in] callback Callback to be invoked once per code object symbol. The + * HSA runtime passes three arguments to the callback: the code object, a + * symbol, and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_code_object_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)(hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void *data), + void *data); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/hsa/hsa_amd_tool.h b/hsa/hsa_amd_tool.h new file mode 100644 index 0000000000..fa9cac804a --- /dev/null +++ b/hsa/hsa_amd_tool.h @@ -0,0 +1,91 @@ +#ifndef HSA_RUNTIME_AMD_TOOL_EVENTS_H_ +#define HSA_RUNTIME_AMD_TOOL_EVENTS_H_ + +// Insert license header + +#include +#include +#include "hsa.h" + + +typedef enum { + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE = 0, + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE = + (1 << 0), // This scratch allocation is only valid for 1 dispatch. 
+ HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT = + (1 << 1), // Used alternate scratch instead of main scratch +} hsa_amd_event_scratch_alloc_flag_t; + +typedef enum { + HSA_AMD_TOOL_EVENT_MIN = 0, + + // Scratch memory tracking + HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START, + HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END, + HSA_AMD_TOOL_EVENT_SCRATCH_FREE_START, + HSA_AMD_TOOL_EVENT_SCRATCH_FREE_END, + HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_START, + HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_END, + + // Add new events above ^ + HSA_AMD_TOOL_EVENT_MAX +} hsa_amd_tool_event_kind_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; +} hsa_amd_tool_event_none_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; + uint64_t dispatch_id; // Dispatch ID of the AQL packet that needs more scratch memory +} hsa_amd_event_scratch_alloc_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; + uint64_t dispatch_id; // Dispatch ID of the AQL packet that needs more scratch memory + size_t size; // Amount of scratch allocated - in bytes + size_t num_slots; // limit of number of waves +} hsa_amd_event_scratch_alloc_end_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_free_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_free_end_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_async_reclaim_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_async_reclaim_end_t; + +typedef union { + const hsa_amd_tool_event_none_t* none; + 
const hsa_amd_event_scratch_alloc_start_t* scratch_alloc_start; + const hsa_amd_event_scratch_alloc_end_t* scratch_alloc_end; + const hsa_amd_event_scratch_free_start_t* scratch_free_start; + const hsa_amd_event_scratch_free_end_t* scratch_free_end; + const hsa_amd_event_scratch_async_reclaim_start_t* scratch_async_reclaim_start; + const hsa_amd_event_scratch_async_reclaim_end_t* scratch_async_reclaim_end; +} hsa_amd_tool_event_t; + +typedef hsa_status_t (*hsa_amd_tool_event)(hsa_amd_tool_event_t); + + +#endif \ No newline at end of file diff --git a/hsa/hsa_api_trace.h b/hsa/hsa_api_trace.h new file mode 100644 index 0000000000..6515b19700 --- /dev/null +++ b/hsa/hsa_api_trace.h @@ -0,0 +1,585 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_H + +#include "hsa.h" +#include "hsa_api_trace_version.h" +#ifdef AMD_INTERNAL_BUILD +#include "hsa_ext_image.h" +#include "hsa_ext_amd.h" +#include "hsa_ext_finalize.h" +#include "hsa_amd_tool.h" +#include "hsa_ven_amd_pc_sampling.h" +#else +#include "inc/hsa_ext_image.h" +#include "inc/hsa_ext_amd.h" +#include "inc/hsa_ext_finalize.h" +#include "inc/hsa_amd_tool.h" +#include "inc/hsa_ven_amd_pc_sampling.h" +#endif + +#include +#include +#include + +// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h + +// Min function used to copy Api Tables +static inline uint32_t Min(const uint32_t a, const uint32_t b) { + return (a > b) ? b : a; +} + +// Declarations of APIs intended for use only by tools. + +// An AQL packet that can be put in an intercept queue to cause a callback to +// be invoked when the packet is about to be submitted to the underlying +// hardware queue. These packets are not copied to the underlying hardware +// queue. These packets should come immediately before the regular AQL packet +// they relate to. 
This implies that packet rewriters should always keep these +// packets adjacent to the regular AQL packet that follows them. +const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE; + +struct amd_aql_intercept_marker_s; + +// When an intercept queue is processing rewritten packets to put them on the +// underlying hardware queue, if it encounters a +// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following +// handler. packet points to the packet, queue is the underlying hardware +// queue, and packet_id is the packet id of the next packet to be put on the +// underlying hardware queue. The intercept queue does not put these packets +// onto the underlying hardware queue. +typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet, + hsa_queue_t* queue, uint64_t packet_id); +// An AQL vendor packet used by the intercept queue to mark the following +// packet. The callback will be invoked to allow a tool to know where in the +// underlying hardware queue the following packet will be placed. user_data can +// be used to hold any data useful to the tool. +typedef struct amd_aql_intercept_marker_s { + uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC. + uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER. + uint8_t reserved[5]; // Must be 0. +#ifdef HSA_LARGE_MODEL + amd_intercept_marker_handler callback; +#elif defined HSA_LITTLE_ENDIAN + amd_intercept_marker_handler callback; + uint32_t reserved1; // Must be 0. +#else + uint32_t reserved1; // Must be 0. 
+ amd_intercept_marker_handler callback; +#endif + uint64_t user_data[6]; +} amd_aql_intercept_marker_t; + +typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count); +typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count, + uint64_t user_pkt_index, void* data, + hsa_amd_queue_intercept_packet_writer writer); +hsa_status_t hsa_amd_queue_intercept_register(hsa_queue_t* queue, + hsa_amd_queue_intercept_handler callback, + void* user_data); +hsa_status_t hsa_amd_queue_intercept_create( + hsa_agent_t agent_handle, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, + uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue); + +typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t* queue, hsa_agent_t agent, + void* data); +hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifier callback, + void* user_data); + +// Structure of Version used to identify an instance of Api table +// Must be the first member (offsetof == 0) of all API tables. +// This is the root of the table passing ABI. 
+struct ApiTableVersion { + uint32_t major_id; + uint32_t minor_id; + uint32_t step_id; + uint32_t reserved; +}; + +struct ToolsApiTable { + ApiTableVersion version; + + hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_end_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_free_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_free_end_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_end_fn; +}; + +// Table to export HSA Finalizer Extension Apis +struct FinalizerExtTable { + ApiTableVersion version; + decltype(hsa_ext_program_create)* hsa_ext_program_create_fn; + decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn; + decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn; + decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn; + decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn; + decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn; +}; + +// Table to export HSA Image Extension Apis +struct ImageExtTable { + ApiTableVersion version; + decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn; + decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn; + decltype(hsa_ext_image_create)* hsa_ext_image_create_fn; + decltype(hsa_ext_image_import)* hsa_ext_image_import_fn; + decltype(hsa_ext_image_export)* hsa_ext_image_export_fn; + decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn; + decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn; + decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn; + decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn; + decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn; + decltype(hsa_ext_image_get_capability_with_layout)* hsa_ext_image_get_capability_with_layout_fn; + decltype(hsa_ext_image_data_get_info_with_layout)* 
hsa_ext_image_data_get_info_with_layout_fn; + decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn; + decltype(hsa_ext_sampler_create_v2)* hsa_ext_sampler_create_v2_fn; + +}; + +// Table to export HSA PC Sampling Extension Apis +struct PcSamplingExtTable { + ApiTableVersion version; + decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn; + decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn; + decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn; + decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn; + decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn; + decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn; + decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn; +}; + + +// Table to export AMD Extension Apis +struct AmdExtTable { + ApiTableVersion version; + decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn; + decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn; + decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn; + decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn; + decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn; + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn; + decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; + decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; + decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; + decltype(hsa_amd_memory_pool_allocate)* 
hsa_amd_memory_pool_allocate_fn; + decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; + decltype(hsa_amd_memory_async_copy_on_engine)* hsa_amd_memory_async_copy_on_engine_fn; + decltype(hsa_amd_memory_copy_engine_status)* hsa_amd_memory_copy_engine_status_fn; + decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; + decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn; + decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn; + decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn; + decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn; + decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn; + decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn; + decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn; + decltype(hsa_amd_image_create)* hsa_amd_image_create_fn; + decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn; + decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn; + decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn; + decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn; + decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn; + decltype(hsa_amd_signal_create)* hsa_amd_signal_create_fn; + decltype(hsa_amd_ipc_signal_create)* hsa_amd_ipc_signal_create_fn; + decltype(hsa_amd_ipc_signal_attach)* hsa_amd_ipc_signal_attach_fn; + decltype(hsa_amd_register_system_event_handler)* hsa_amd_register_system_event_handler_fn; + decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; + decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn; + decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; + 
decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn; + decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn; + decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; + decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; + decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; + decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; + decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; + decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; + decltype(hsa_amd_spm_acquire)* hsa_amd_spm_acquire_fn; + decltype(hsa_amd_spm_release)* hsa_amd_spm_release_fn; + decltype(hsa_amd_spm_set_dest_buffer)* hsa_amd_spm_set_dest_buffer_fn; + decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn; + decltype(hsa_amd_portable_export_dmabuf)* hsa_amd_portable_export_dmabuf_fn; + decltype(hsa_amd_portable_close_dmabuf)* hsa_amd_portable_close_dmabuf_fn; + decltype(hsa_amd_vmem_address_reserve)* hsa_amd_vmem_address_reserve_fn; + decltype(hsa_amd_vmem_address_free)* hsa_amd_vmem_address_free_fn; + decltype(hsa_amd_vmem_handle_create)* hsa_amd_vmem_handle_create_fn; + decltype(hsa_amd_vmem_handle_release)* hsa_amd_vmem_handle_release_fn; + decltype(hsa_amd_vmem_map)* hsa_amd_vmem_map_fn; + decltype(hsa_amd_vmem_unmap)* hsa_amd_vmem_unmap_fn; + decltype(hsa_amd_vmem_set_access)* hsa_amd_vmem_set_access_fn; + decltype(hsa_amd_vmem_get_access)* hsa_amd_vmem_get_access_fn; + decltype(hsa_amd_vmem_export_shareable_handle)* hsa_amd_vmem_export_shareable_handle_fn; + decltype(hsa_amd_vmem_import_shareable_handle)* hsa_amd_vmem_import_shareable_handle_fn; + decltype(hsa_amd_vmem_retain_alloc_handle)* hsa_amd_vmem_retain_alloc_handle_fn; + decltype(hsa_amd_vmem_get_alloc_properties_from_handle)* + hsa_amd_vmem_get_alloc_properties_from_handle_fn; + 
decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn; + decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn; + decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn; + decltype(hsa_amd_enable_logging)* hsa_amd_enable_logging_fn; + decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn; + decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn; + decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn; +}; + +// Table to export HSA Core Runtime Apis +struct CoreApiTable { + ApiTableVersion version; + decltype(hsa_init)* hsa_init_fn; + decltype(hsa_shut_down)* hsa_shut_down_fn; + decltype(hsa_system_get_info)* hsa_system_get_info_fn; + decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn; + decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn; + decltype(hsa_iterate_agents)* hsa_iterate_agents_fn; + decltype(hsa_agent_get_info)* hsa_agent_get_info_fn; + decltype(hsa_queue_create)* hsa_queue_create_fn; + decltype(hsa_soft_queue_create)* hsa_soft_queue_create_fn; + decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn; + decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; + decltype(hsa_queue_cas_write_index_scacq_screl)* hsa_queue_cas_write_index_scacq_screl_fn; + decltype(hsa_queue_cas_write_index_scacquire)* hsa_queue_cas_write_index_scacquire_fn; + 
decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn; + decltype(hsa_queue_cas_write_index_screlease)* hsa_queue_cas_write_index_screlease_fn; + decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn; + decltype(hsa_queue_add_write_index_scacquire)* hsa_queue_add_write_index_scacquire_fn; + decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn; + decltype(hsa_queue_add_write_index_screlease)* hsa_queue_add_write_index_screlease_fn; + decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn; + decltype(hsa_queue_store_read_index_screlease)* hsa_queue_store_read_index_screlease_fn; + decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn; + decltype(hsa_region_get_info)* hsa_region_get_info_fn; + decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn; + decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn; + decltype(hsa_memory_register)* hsa_memory_register_fn; + decltype(hsa_memory_deregister)* hsa_memory_deregister_fn; + decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; + decltype(hsa_memory_free)* hsa_memory_free_fn; + decltype(hsa_memory_copy)* hsa_memory_copy_fn; + decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; + decltype(hsa_signal_create)* hsa_signal_create_fn; + decltype(hsa_signal_destroy)* hsa_signal_destroy_fn; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn; + decltype(hsa_signal_load_scacquire)* hsa_signal_load_scacquire_fn; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn; + decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire_fn; + decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn; + decltype(hsa_signal_and_scacquire)* hsa_signal_and_scacquire_fn; + 
decltype(hsa_signal_and_screlease)* hsa_signal_and_screlease_fn; + decltype(hsa_signal_and_scacq_screl)* hsa_signal_and_scacq_screl_fn; + decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn; + decltype(hsa_signal_or_scacquire)* hsa_signal_or_scacquire_fn; + decltype(hsa_signal_or_screlease)* hsa_signal_or_screlease_fn; + decltype(hsa_signal_or_scacq_screl)* hsa_signal_or_scacq_screl_fn; + decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn; + decltype(hsa_signal_xor_scacquire)* hsa_signal_xor_scacquire_fn; + decltype(hsa_signal_xor_screlease)* hsa_signal_xor_screlease_fn; + decltype(hsa_signal_xor_scacq_screl)* hsa_signal_xor_scacq_screl_fn; + decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn; + decltype(hsa_signal_exchange_scacquire)* hsa_signal_exchange_scacquire_fn; + decltype(hsa_signal_exchange_screlease)* hsa_signal_exchange_screlease_fn; + decltype(hsa_signal_exchange_scacq_screl)* hsa_signal_exchange_scacq_screl_fn; + decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn; + decltype(hsa_signal_add_scacquire)* hsa_signal_add_scacquire_fn; + decltype(hsa_signal_add_screlease)* hsa_signal_add_screlease_fn; + decltype(hsa_signal_add_scacq_screl)* hsa_signal_add_scacq_screl_fn; + decltype(hsa_signal_subtract_relaxed)* hsa_signal_subtract_relaxed_fn; + decltype(hsa_signal_subtract_scacquire)* hsa_signal_subtract_scacquire_fn; + decltype(hsa_signal_subtract_screlease)* hsa_signal_subtract_screlease_fn; + decltype(hsa_signal_subtract_scacq_screl)* hsa_signal_subtract_scacq_screl_fn; + decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn; + decltype(hsa_signal_cas_scacquire)* hsa_signal_cas_scacquire_fn; + decltype(hsa_signal_cas_screlease)* hsa_signal_cas_screlease_fn; + decltype(hsa_signal_cas_scacq_screl)* hsa_signal_cas_scacq_screl_fn; + + //===--- Instruction Set Architecture -----------------------------------===// + + decltype(hsa_isa_from_name)* hsa_isa_from_name_fn; + // Deprecated since v1.1. 
+ decltype(hsa_isa_get_info)* hsa_isa_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_isa_compatible)* hsa_isa_compatible_fn; + + //===--- Code Objects (deprecated) --------------------------------------===// + + // Deprecated since v1.1. + decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn; + // Deprecated since v1.1. + decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn; + + //===--- Executable -----------------------------------------------------===// + + // Deprecated since v1.1. + decltype(hsa_executable_create)* hsa_executable_create_fn; + decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; + // Deprecated since v1.1. + decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn; + decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; + decltype(hsa_executable_get_info)* hsa_executable_get_info_fn; + decltype(hsa_executable_global_variable_define)* + hsa_executable_global_variable_define_fn; + decltype(hsa_executable_agent_global_variable_define)* + hsa_executable_agent_global_variable_define_fn; + decltype(hsa_executable_readonly_variable_define)* + hsa_executable_readonly_variable_define_fn; + decltype(hsa_executable_validate)* hsa_executable_validate_fn; + // Deprecated since v1.1. + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn; + // Deprecated since v1.1. 
+ decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn; + + //===--- Runtime Notifications ------------------------------------------===// + + decltype(hsa_status_string)* hsa_status_string_fn; + + // Start HSA v1.1 additions + decltype(hsa_extension_get_name)* hsa_extension_get_name_fn; + decltype(hsa_system_major_extension_supported)* hsa_system_major_extension_supported_fn; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table_fn; + decltype(hsa_agent_major_extension_supported)* hsa_agent_major_extension_supported_fn; + decltype(hsa_cache_get_info)* hsa_cache_get_info_fn; + decltype(hsa_agent_iterate_caches)* hsa_agent_iterate_caches_fn; + decltype(hsa_signal_silent_store_relaxed)* hsa_signal_silent_store_relaxed_fn; + decltype(hsa_signal_silent_store_screlease)* hsa_signal_silent_store_screlease_fn; + decltype(hsa_signal_group_create)* hsa_signal_group_create_fn; + decltype(hsa_signal_group_destroy)* hsa_signal_group_destroy_fn; + decltype(hsa_signal_group_wait_any_scacquire)* hsa_signal_group_wait_any_scacquire_fn; + decltype(hsa_signal_group_wait_any_relaxed)* hsa_signal_group_wait_any_relaxed_fn; + + //===--- Instruction Set Architecture - HSA v1.1 additions --------------===// + + decltype(hsa_agent_iterate_isas)* hsa_agent_iterate_isas_fn; + decltype(hsa_isa_get_info_alt)* hsa_isa_get_info_alt_fn; + decltype(hsa_isa_get_exception_policies)* hsa_isa_get_exception_policies_fn; + decltype(hsa_isa_get_round_method)* hsa_isa_get_round_method_fn; + decltype(hsa_wavefront_get_info)* hsa_wavefront_get_info_fn; + decltype(hsa_isa_iterate_wavefronts)* hsa_isa_iterate_wavefronts_fn; + + //===--- Code Objects (deprecated) - HSA v1.1 additions -----------------===// + + // Deprecated since v1.1. 
+ decltype(hsa_code_object_get_symbol_from_name)* + hsa_code_object_get_symbol_from_name_fn; + + //===--- Executable - HSA v1.1 additions --------------------------------===// + + decltype(hsa_code_object_reader_create_from_file)* + hsa_code_object_reader_create_from_file_fn; + decltype(hsa_code_object_reader_create_from_memory)* + hsa_code_object_reader_create_from_memory_fn; + decltype(hsa_code_object_reader_destroy)* hsa_code_object_reader_destroy_fn; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt_fn; + decltype(hsa_executable_load_program_code_object)* + hsa_executable_load_program_code_object_fn; + decltype(hsa_executable_load_agent_code_object)* + hsa_executable_load_agent_code_object_fn; + decltype(hsa_executable_validate_alt)* hsa_executable_validate_alt_fn; + decltype(hsa_executable_get_symbol_by_name)* + hsa_executable_get_symbol_by_name_fn; + decltype(hsa_executable_iterate_agent_symbols)* + hsa_executable_iterate_agent_symbols_fn; + decltype(hsa_executable_iterate_program_symbols)* + hsa_executable_iterate_program_symbols_fn; +}; + +// Table to export HSA Apis from Core Runtime, Amd Extensions +// Finalizer and Images +struct HsaApiTable { + + // Version of Hsa Api Table + ApiTableVersion version; + + // Table of function pointers to HSA Core Runtime + CoreApiTable* core_; + + // Table of function pointers to AMD extensions + AmdExtTable* amd_ext_; + + // Table of function pointers to HSA Finalizer Extension + FinalizerExtTable* finalizer_ext_; + + // Table of function pointers to HSA Image Extension + ImageExtTable* image_ext_; + + // Table of function pointers for tools to use + ToolsApiTable* tools_; + + // Table of function pointers to AMD PC Sampling Extension + PcSamplingExtTable* pc_sampling_ext_; +}; + +// Structure containing instances of different api tables +struct HsaApiTableContainer { + HsaApiTable root; + CoreApiTable core; + AmdExtTable amd_ext; + FinalizerExtTable finalizer_ext; + ImageExtTable image_ext; + 
ToolsApiTable tools; + PcSamplingExtTable pc_sampling_ext; + + // Default initialization of a container instance + HsaApiTableContainer() { + root.version.major_id = HSA_API_TABLE_MAJOR_VERSION; + root.version.minor_id = sizeof(HsaApiTable); + root.version.step_id = HSA_API_TABLE_STEP_VERSION; + + core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION; + core.version.minor_id = sizeof(CoreApiTable); + core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION; + root.core_ = &core; + + amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION; + amd_ext.version.minor_id = sizeof(AmdExtTable); + amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION; + root.amd_ext_ = &amd_ext; + + finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; + finalizer_ext.version.minor_id = sizeof(FinalizerExtTable); + finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; + root.finalizer_ext_ = &finalizer_ext; + + image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; + image_ext.version.minor_id = sizeof(ImageExtTable); + image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION; + root.image_ext_ = &image_ext; + + tools.version.major_id = HSA_TOOLS_API_TABLE_MAJOR_VERSION; + tools.version.minor_id = sizeof(ToolsApiTable); + tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION; + root.tools_ = &tools; + + pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable); + pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + root.pc_sampling_ext_ = &pc_sampling_ext; + } +}; + +// Api to copy function pointers of a table +static +void inline copyApi(void* src, void* dest, size_t size) { + assert(size >= sizeof(ApiTableVersion)); + memcpy((char*)src + sizeof(ApiTableVersion), + (char*)dest + sizeof(ApiTableVersion), + (size - sizeof(ApiTableVersion))); +} + +// Copy Api child tables if valid. 
+static void inline copyElement(ApiTableVersion* dest, ApiTableVersion* src) { + if (src->major_id && (dest->major_id == src->major_id)) { + dest->step_id = src->step_id; + dest->minor_id = Min(dest->minor_id, src->minor_id); + copyApi(dest, src, dest->minor_id); + } else { + dest->major_id = 0; + dest->minor_id = 0; + dest->step_id = 0; + } +} + +// Copy constructor for all Api tables. The function assumes the +// user has initialized an instance of tables container correctly +// for the Major, Minor and Stepping Ids of Root and Child Api tables. +// The function will overwrite the value of Minor Id by taking the +// minimum of source and destination parameters. It will also overwrite +// the stepping Id with value from source parameter. +static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) { + // Verify Major Id of source and destination tables match + if (dest->version.major_id != src->version.major_id) { + dest->version.major_id = 0; + dest->version.minor_id = 0; + dest->version.step_id = 0; + return; + } + + // Initialize the stepping id and minor id of root table. 
For the + // minor id which encodes struct size, take the minimum of source + // and destination parameters + dest->version.step_id = src->version.step_id; + dest->version.minor_id = Min(dest->version.minor_id, src->version.minor_id); + + // Copy child tables if present + if ((offsetof(HsaApiTable, core_) < dest->version.minor_id)) + copyElement(&dest->core_->version, &src->core_->version); + if ((offsetof(HsaApiTable, amd_ext_) < dest->version.minor_id)) + copyElement(&dest->amd_ext_->version, &src->amd_ext_->version); + if ((offsetof(HsaApiTable, finalizer_ext_) < dest->version.minor_id)) + copyElement(&dest->finalizer_ext_->version, &src->finalizer_ext_->version); + if ((offsetof(HsaApiTable, image_ext_) < dest->version.minor_id)) + copyElement(&dest->image_ext_->version, &src->image_ext_->version); + if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id)) + copyElement(&dest->tools_->version, &src->tools_->version); + if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id)) + copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version); +} +#endif diff --git a/hsa/hsa_api_trace_version.h b/hsa/hsa_api_trace_version.h new file mode 100644 index 0000000000..befd1e26e3 --- /dev/null +++ b/hsa/hsa_api_trace_version.h @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H + +// CODE IN THIS FILE **MUST** BE C-COMPATIBLE + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x03 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x01 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x07 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01 +// Rocprofiler just checks HSA_IMAGE_EXT_API_TABLE_STEP_VERSION +#define HSA_IMAGE_EXT_API_TABLE_STEP_VERSION HSA_IMAGE_API_TABLE_STEP_VERSION +#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 +#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00 + +#endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H diff --git a/hsa/hsa_ext_amd.h b/hsa/hsa_ext_amd.h new file mode 100644 index 0000000000..c402a51264 --- /dev/null +++ b/hsa/hsa_ext_amd.h @@ -0,0 +1,3675 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension. 
+ +#ifndef HSA_RUNTIME_EXT_AMD_H_ +#define HSA_RUNTIME_EXT_AMD_H_ + +#include "hsa.h" +#include "hsa_ext_image.h" +#include "hsa_ven_amd_pc_sampling.h" + +/** + * - 1.0 - initial version + * - 1.1 - dmabuf export + * - 1.2 - hsa_amd_memory_async_copy_on_engine + * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool + * - 1.4 - Virtual Memory API + * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES + * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align + * - 1.7 - hsa_amd_signal_wait_all + * - 1.8 - hsa_amd_memory_get_preferred_copy_engine + * - 1.9 - hsa_amd_portable_export_dmabuf_v2 + * - 1.10 - hsa_amd_vmem_address_reserve: HSA_AMD_VMEM_ADDRESS_NO_REGISTER + * - 1.11 - hsa_amd_agent_info_t: HSA_AMD_AGENT_INFO_CLOCK_COUNTERS + */ +#define HSA_AMD_INTERFACE_VERSION_MAJOR 1 +#define HSA_AMD_INTERFACE_VERSION_MINOR 11 + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Macro to set a flag within uint8_t[8] types. + */ +static inline void hsa_flag_set64(uint8_t* value, uint32_t bit) { + unsigned int index = bit / 8; + unsigned int subBit = bit % 8; + (((uint8_t*)value)[index]) |= (1 << subBit); +} + +/** + * @brief Macro to determine whether a flag is set within uint8_t[8] types. + */ +static inline bool hsa_flag_isset64(uint8_t* value, uint32_t bit) { + unsigned int index = bit / 8; + unsigned int subBit = bit % 8; + return ((uint8_t*)value)[index] & (1 << subBit); +} + +/** + * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants. + */ +typedef uint32_t hsa_signal_condition32_t; + +/** + * @brief AMD vendor specific packet type. + */ +typedef enum { + /** + * Packet used by agents to delay processing of subsequent packets until a + * configurable condition is satisfied by an HSA signal. Only kernel dispatch + * queues created from AMD GPU Agents support this packet. 
+ */ + HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, + /** + * Packet used to send commands to an AIE agent's embedded runtime (ERT). The + * ERT is responsible for, among other things, handling dispatches. Only + * queues created on AIE agents support this packet. + */ + HSA_AMD_PACKET_TYPE_AIE_ERT = 3 +} hsa_amd_packet_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants. + */ +typedef uint8_t hsa_amd_packet_type8_t; + +/** + * @brief AMD vendor specific AQL packet header + */ +typedef struct hsa_amd_packet_header_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Format of the vendor specific packet. + */ + hsa_amd_packet_type8_t AmdFormat; + + /** + * Reserved. Must be 0. + */ + uint8_t reserved; +} hsa_amd_vendor_packet_header_t; + +/** + * @brief AMD barrier value packet. Halts packet processing and waits for + * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value + * is the value of the signal ::signal. + */ +typedef struct hsa_amd_barrier_value_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + + /** + * Dependent signal object. A signal with a handle value of 0 is + * allowed and is interpreted by the packet processor as a satisfied + * dependency. + */ + hsa_signal_t signal; + + /** + * Value to compare against. + */ + hsa_signal_value_t value; + + /** + * Bit mask to be combined by bitwise AND with ::signal's value. + */ + hsa_signal_value_t mask; + + /** + * Comparison operation. See ::hsa_signal_condition_t. + */ + hsa_signal_condition32_t cond; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Reserved. Must be 0. 
+ */ + uint64_t reserved3; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; +} hsa_amd_barrier_value_packet_t; + +/** + * State of an AIE ERT command. + */ +typedef enum { + /** + * Set by the host before submitting a command to the scheduler. + */ + HSA_AMD_AIE_ERT_STATE_NEW = 1, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_QUEUED = 2, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_RUNNING = 3, + /** + * Set by the scheduler when a command completes. + */ + HSA_AMD_AIE_ERT_STATE_COMPLETED = 4, + /** + * Set by the scheduler if a command failed. + */ + HSA_AMD_AIE_ERT_STATE_ERROR = 5, + /** + * Set by the scheduler if a command aborted. + */ + HSA_AMD_AIE_ERT_STATE_ABORT = 6, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7, + /** + * Set by the scheduler on a timeout and reset. + */ + HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8, + /** + * Set by the scheduler on a timeout and fail to reset. + */ + HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9, + HSA_AMD_AIE_ERT_STATE_SKERROR = 10, + HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11, + HSA_AMD_AIE_ERT_STATE_MAX +} hsa_amd_aie_ert_state; + +/** + * Opcode types for HSA AIE ERT commands. + */ +typedef enum { + /** + * Start a workgroup on a compute unit (CU). + */ + HSA_AMD_AIE_ERT_START_CU = 0, + /** + * Currently aliased to HSA_AMD_AIE_ERT_START_CU. + */ + HSA_AMD_AIE_ERT_START_KERNEL = 0, + /** + * Configure command scheduler. + */ + HSA_AMD_AIE_ERT_CONFIGURE = 2, + HSA_AMD_AIE_ERT_EXIT = 3, + HSA_AMD_AIE_ERT_ABORT = 4, + /** + * Execute a specified CU after writing. + */ + HSA_AMD_AIE_ERT_EXEC_WRITE = 5, + /** + * Get stats about a CU's execution. + */ + HSA_AMD_AIE_ERT_CU_STAT = 6, + /** + * Start KDMA CU or P2P. + */ + HSA_AMD_AIE_ERT_START_COPYBO = 7, + /** + * Configure a soft kernel. 
+ */ + HSA_AMD_AIE_ERT_SK_CONFIG = 8, + /** + * Start a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_START = 9, + /** + * Unconfigure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_UNCONFIG = 10, + /** + * Initialize a CU. + */ + HSA_AMD_AIE_ERT_INIT_CU = 11, + HSA_AMD_AIE_ERT_START_FA = 12, + HSA_AMD_AIE_ERT_CLK_CALIB = 13, + HSA_AMD_AIE_ERT_MB_VALIDATE = 14, + /** + * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair. + */ + HSA_AMD_AIE_ERT_START_KEY_VAL = 15, + HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16, + HSA_AMD_AIE_ERT_ACCESS_TEST = 17, + /** + * Instruction buffer command format. + */ + HSA_AMD_AIE_ERT_START_DPU = 18, + /** + * Command chain. + */ + HSA_AMD_AIE_ERT_CMD_CHAIN = 19, + /** + * Instruction buffer command format on NPU. + */ + HSA_AMD_AIE_ERT_START_NPU = 20, + /** + * Instruction buffer command with pre-emption format on the NPU. + */ + HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21 +} hsa_amd_aie_ert_cmd_opcode_t; + +/** + * Payload data for AIE ERT start kernel packets (i.e., when the opcode is + * HSA_AMD_AIE_ERT_START_KERNEL). + */ +typedef struct hsa_amd_aie_ert_start_kernel_data_s { + /** + * Address to the PDI. + */ + void* pdi_addr; + /** + * Opcode, instructions and kernel arguments. + */ + uint32_t data[]; +} hsa_amd_aie_ert_start_kernel_data_t; + +/** + * AMD AIE ERT packet. Used for sending a command to an AIE agent. + */ +typedef struct hsa_amd_aie_ert_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + /** + * Format for packets interpreted by the ERT to understand the command and + * payload data. + */ + struct { + /** + * Current state of a command. + */ + uint32_t state : 4; + /** + * Flexible field that can be interpreted on a per-command basis. + */ + uint32_t custom : 8; + /** + * Number of DWORDs in the payload data. + */ + uint32_t count : 11; + /** + * Opcode identifying the command. + */ + uint32_t opcode : 5; + /** + * Type of a command (currently 0). 
+ */ + uint32_t type : 4; + }; + /** + * Reserved. Must be 0. + */ + uint64_t reserved0; + /** + * Reserved. Must be 0. + */ + uint64_t reserved1; + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + /** + * Reserved. Must be 0. + */ + uint64_t reserved4; + /** + * Reserved. Must be 0. + */ + uint64_t reserved5; + /** + * Address of packet data payload. ERT commands contain arbitrarily sized + * data payloads. + */ + uint64_t payload_data; +} hsa_amd_aie_ert_packet_t; + +/** @} */ + +/** \defgroup error-codes Error codes + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * The memory pool is invalid. + */ + HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40, + + /** + * Agent accessed memory beyond the maximum legal address. + */ + HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41, + + /** + * Agent executed an invalid shader instruction. + */ + HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42, + + /** + * Agent attempted to access an inaccessible address. + * See hsa_amd_register_system_event_handler and + * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses. + */ + HSA_STATUS_ERROR_MEMORY_FAULT = 43, + + /** + * The CU mask was successfully set but the mask attempted to enable a CU + * which was disabled for the process. CUs disabled for the process remain + * disabled. 
+ */ + HSA_STATUS_CU_MASK_REDUCED = 44, + + /** + * Exceeded number of VGPRs available on this agent + */ + HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45, + + /** + * Resource is busy or temporarily unavailable + */ + HSA_STATUS_ERROR_RESOURCE_BUSY = 46, + + /** + * Request is not supported by this system + */ + HSA_STATUS_ERROR_NOT_SUPPORTED = 47, +}; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief IOMMU version supported + */ +typedef enum { + /** + * IOMMU not supported + */ + HSA_IOMMU_SUPPORT_NONE = 0, + /* IOMMU V1 support is not relevant to user applications, so not reporting it */ + /** + * IOMMU V2 supported + */ + HSA_IOMMU_SUPPORT_V2 = 1, +} hsa_amd_iommu_version_t; + +/** + * @brief Structure containing information on the agent's clock counters. + */ +typedef struct hsa_amd_clock_counters_s { + uint64_t gpu_clock_counter; + uint64_t cpu_clock_counter; + uint64_t system_clock_counter; + uint64_t system_clock_frequency; +} hsa_amd_clock_counters_t; + +/** + * @brief Agent attributes. + */ +typedef enum hsa_amd_agent_info_s { + /** + * Chip identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000, + /** + * Size of a cacheline in bytes. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, + /** + * The number of compute unit available in the agent. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, + /** + * The maximum clock frequency of the agent in MHz. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, + /** + * Internal driver node identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004, + /** + * Max number of watch points on memory address ranges to generate exception + * events when the watched addresses are accessed. The type of this + * attribute is uint32_t. 
+ */ + HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005, + /** + * Agent BDF_ID, named LocationID in thunk. The type of this attribute is + * uint32_t. + */ + HSA_AMD_AGENT_INFO_BDFID = 0xA006, + /** + * Memory Interface width, the return value type is uint32_t. + * This attribute is deprecated. + */ + HSA_AMD_AGENT_INFO_MEMORY_WIDTH = 0xA007, + /** + * Max Memory Clock, the return value type is uint32_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008, + /** + * Board name of Agent - populated from MarketingName of Kfd Node + * The value is an Ascii string of 64 chars. + */ + HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009, + /** + * Maximum number of waves possible in a Compute Unit. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, + /** + * Number of SIMD's per compute unit CU + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, + /** + * Number of Shader Engines (SE) in Gpu + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES = 0xA00C, + /** + * Number of Shader Arrays Per Shader Engines in Gpu + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE = 0xA00D, + /** + * Address of the HDP flush registers. Use of these registers does not conform to the HSA memory + * model and should be treated with caution. + * The type of this attribute is hsa_amd_hdp_flush_t. + */ + HSA_AMD_AGENT_INFO_HDP_FLUSH = 0xA00E, + /** + * PCIe domain for the agent. Pairs with HSA_AMD_AGENT_INFO_BDFID + * to give the full physical location of the Agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F, + /** + * Queries for support of cooperative queues. See ::HSA_QUEUE_TYPE_COOPERATIVE. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + /** + * Queries UUID of an agent. 
The value is an Ascii string with a maximum + * of 21 chars including NUL. The string value consists of two parts: header + * and body. The header identifies device type (GPU, CPU, DSP) while body + * encodes UUID as a 16 digit hex string + * + * Agents that do not support UUID will return the string "GPU-XX" or + * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t + */ + HSA_AMD_AGENT_INFO_UUID = 0xA011, + /** + * Queries for the ASIC revision of an agent. The value is an integer that + * increments for each revision. This can be used by user-level software to + * change how it operates, depending on the hardware version. This allows + * selective workarounds for hardware errata. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012, + /** + * Queries whether or not the host can directly access SVM memory that is + * physically resident in the agent's local memory. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013, + /** + * Some processors support more CUs than can reliably be used in a cooperative + * dispatch. This queries the count of CUs which are fully enabled for + * cooperative dispatch. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014, + /** + * Queries the amount of memory available in bytes across all global pools + * owned by the agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, + /** + * Queries for the ASIC family ID of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107, + /** + * Queries for the Packet Processor(CP Firmware) ucode version of an agent. 
+ * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_UCODE_VERSION = 0xA108, + /** + * Queries for the SDMA engine ucode of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION = 0xA109, + /** + * Queries the number of SDMA engines. + * If HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG query returns non-zero, + * this query returns the number of SDMA engines optimized for + * host to device bidirectional traffic. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SDMA_ENG = 0xA10A, + /** + * Queries the number of additional SDMA engines optimized for D2D xGMI copies. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG = 0xA10B, + /** + * Queries for version of IOMMU supported by agent. + * The type of this attribute is hsa_amd_iommu_version_t. + */ + HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110, + /** + * Queries for number of XCCs within the agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111, + /** + * Queries for driver unique identifier. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112, + /** + * Returns the hsa_agent_t of the nearest CPU agent + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_AGENT_INFO_NEAREST_CPU = 0xA113, + /** + * Bit-mask indicating memory properties of this agent. A memory property is set if the flag bit + * is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag + * is set. The type of this attribute is uint8_t[8]. + */ + HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES = 0xA114, + /** + * Bit-mask indicating AQL Extensions supported by this agent. An AQL extension is set if the flag + * bit is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag + * is set. The type of this attribute is uint8_t[8]. 
+ */ + HSA_AMD_AGENT_INFO_AQL_EXTENSIONS = 0xA115, /* Not implemented yet */ + /** + * Maximum allowed value in bytes for scratch limit for this agent. This amount + * is shared across all queues created on this agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX = 0xA116, + /** + * Current scratch limit threshold in bytes for this agent. This limit can be + * modified using the hsa_amd_agent_set_async_scratch_limit call. + * - AQL dispatches that require scratch-memory above this threshold will trigger a + * scratch use-once. + * - For AQL dispatches using less scratch-memory than this threshold, ROCr will + * permanently assign the allocated scratch memory to the queue handling the dispatch. + * This memory can be reclaimed by calling hsa_amd_agent_set_async_scratch_limit + * with a threshold lower than the current value. + * + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT = 0xA117, + /** + * Queries the driver for clock counters of the agent. + * The type of this attribute is hsa_amd_clock_counters_t. + */ + HSA_AMD_AGENT_INFO_CLOCK_COUNTERS = 0xA118 +} hsa_amd_agent_info_t; + +/** + * @brief Agent memory properties attributes + */ +typedef enum hsa_amd_agent_memory_properties_s { + HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU = (1 << 0), +} hsa_amd_agent_memory_properties_t; + +/** + * @brief SDMA engine IDs unique by single set bit position. 
+ */ +typedef enum hsa_amd_sdma_engine_id { + HSA_AMD_SDMA_ENGINE_0 = 0x1, + HSA_AMD_SDMA_ENGINE_1 = 0x2, + HSA_AMD_SDMA_ENGINE_2 = 0x4, + HSA_AMD_SDMA_ENGINE_3 = 0x8, + HSA_AMD_SDMA_ENGINE_4 = 0x10, + HSA_AMD_SDMA_ENGINE_5 = 0x20, + HSA_AMD_SDMA_ENGINE_6 = 0x40, + HSA_AMD_SDMA_ENGINE_7 = 0x80, + HSA_AMD_SDMA_ENGINE_8 = 0x100, + HSA_AMD_SDMA_ENGINE_9 = 0x200, + HSA_AMD_SDMA_ENGINE_10 = 0x400, + HSA_AMD_SDMA_ENGINE_11 = 0x800, + HSA_AMD_SDMA_ENGINE_12 = 0x1000, + HSA_AMD_SDMA_ENGINE_13 = 0x2000, + HSA_AMD_SDMA_ENGINE_14 = 0x4000, + HSA_AMD_SDMA_ENGINE_15 = 0x8000 +} hsa_amd_sdma_engine_id_t; + +typedef struct hsa_amd_hdp_flush_s { + uint32_t* HDP_MEM_FLUSH_CNTL; + uint32_t* HDP_REG_FLUSH_CNTL; +} hsa_amd_hdp_flush_t; + +/** + * @brief Region attributes. + */ +#ifdef __cplusplus +typedef enum hsa_amd_region_info_s : int { +#else +typedef enum hsa_amd_region_info_s { +#endif + /** + * Determine if host can access the region. The type of this attribute + * is bool. + */ + HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000, + /** + * Base address of the region in flat address space. + */ + HSA_AMD_REGION_INFO_BASE = 0xA001, + /** + * Memory Interface width, the return value type is uint32_t. + * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH. + */ + HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002, + /** + * Max Memory Clock, the return value type is uint32_t. + * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY. + */ + HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003, +} hsa_amd_region_info_t; + +/** + * @brief Coherency attributes of fine grain region. + */ +typedef enum hsa_amd_coherency_type_s { + /** + * Coherent region. + */ + HSA_AMD_COHERENCY_TYPE_COHERENT = 0, + /** + * Non coherent region. 
+ */ + HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1 +} hsa_amd_coherency_type_t; + + +/** + * @brief dmabuf attributes + */ +#ifdef __cplusplus +typedef enum hsa_amd_dma_buf_mapping_type_s : int { +#else +typedef enum hsa_amd_dma_buf_mapping_type_s { +#endif + HSA_AMD_DMABUF_MAPPING_TYPE_NONE = 0, + HSA_AMD_DMABUF_MAPPING_TYPE_PCIE = 1 +} hsa_amd_dma_buf_mapping_type_t; +/** + * @brief Get the coherency type of the fine grain region of an agent. + * + * @param[in] agent A valid agent. + * + * @param[out] type Pointer to a memory location where the HSA runtime will + * store the coherency type of the fine grain region. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL. + */ +hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent, + hsa_amd_coherency_type_t* type); + +/** + * @brief Set the coherency type of the fine grain region of an agent. + * Deprecated. This is supported on KV platforms. For backward compatibility + * other platforms will spuriously succeed. + * + * @param[in] agent A valid agent. + * + * @param[in] type The coherency type to be set. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid. + */ +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent, + hsa_amd_coherency_type_t type); + +/** @} */ + +/** \defgroup profile Profiling + * @{ + */ + +/** + * @brief Structure containing profiling dispatch time information. + * + * Times are reported as ticks in the domain of the HSA system clock. 
+ * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_dispatch_time_s { + /** + * Dispatch packet processing start time. + */ + uint64_t start; + /** + * Dispatch packet completion time. + */ + uint64_t end; +} hsa_amd_profiling_dispatch_time_t; + +/** + * @brief Structure containing profiling async copy time information. + * + * Times are reported as ticks in the domain of the HSA system clock. + * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_async_copy_time_s { + /** + * Async copy processing start time. + */ + uint64_t start; + /** + * Async copy completion time. + */ + uint64_t end; +} hsa_amd_profiling_async_copy_time_t; + +/** + * @brief Enable or disable profiling capability of a queue. + * + * @param[in] queue A valid queue. + * + * @param[in] enable 1 to enable profiling. 0 to disable profiling. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable); + +/** + * @brief Enable or disable asynchronous memory copy profiling. + * + * @details The runtime will provide the copy processing start timestamp and + * completion timestamp of each call to hsa_amd_memory_async_copy if the + * async copy profiling is enabled prior to the call to + * hsa_amd_memory_async_copy. The completion signal object is used to + * hold the last async copy start and end timestamp. The client can retrieve + * these timestamps via call to hsa_amd_profiling_get_async_copy_time. + * + * @param[in] enable True to enable profiling. False to disable profiling. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources + * needed to profile the asynchronous copy. + */ +hsa_status_t HSA_API + hsa_amd_profiling_async_copy_enable(bool enable); + +/** + * @brief Retrieve packet processing time stamps. + * + * @param[in] agent The agent with which the signal was last used. For + * instance, if the profiled dispatch packet is dispatched onto queue Q, + * which was created on agent A, then this parameter must be A. + * + * @param[in] signal A signal used as the completion signal of the dispatch + * packet to retrieve time stamps from. This dispatch packet must have been + * issued to a queue with profiling enabled and have already completed. Also + * the signal must not have yet been used in any other packet following the + * completion of the profiled dispatch packet. + * + * @param[out] time Packet processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent, hsa_signal_t signal, + hsa_amd_profiling_dispatch_time_t* time); + +/** + * @brief Retrieve asynchronous copy timestamps. + * + * @details Async copy profiling is enabled via call to + * hsa_amd_profiling_async_copy_enable. + * + * @param[in] signal A signal used as the completion signal of the call to + * hsa_amd_memory_async_copy. 
+ * + * @param[out] time Async copy processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time( + hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time); + +/** + * @brief Computes the frequency ratio and offset between the agent clock and + * HSA system clock and converts the agent's tick to HSA system domain tick. + * + * @param[in] agent The agent used to retrieve the agent_tick. It is user's + * responsibility to make sure the tick number is from this agent, otherwise, + * the behavior is undefined. + * + * @param[in] agent_tick The tick count retrieved from the specified @p agent. + * + * @param[out] system_tick The translated HSA system domain clock counter tick. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL; + */ +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent, + uint64_t agent_tick, + uint64_t* system_tick); + +/** @} */ + +/** \defgroup status Runtime notifications + * @{ + */ + +/** + * @brief Signal attribute flags. + */ +typedef enum { + /** + * Signal will only be consumed by AMD GPUs. Limits signal consumption to + * AMD GPU agents only. Ignored if @p num_consumers is not zero (all agents). + */ + HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1, + /** + * Signal may be used for interprocess communication. 
+ * IPC signals can be read, written, and waited on from any process. + * Profiling using an IPC enabled signal is only supported in a single process + * at a time. Producing profiling data in one process and consuming it in + * another process is undefined. + */ + HSA_AMD_SIGNAL_IPC = 2, +} hsa_amd_signal_attribute_t; + +/** + * @brief Create a signal with specific attributes. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[in] attributes Requested signal attributes. Multiple signal attributes + * may be requested by combining them with bitwise OR. Requesting no attributes + * (@p attributes == 0) results in the same signal as would have been obtained + * via hsa_signal_create. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. 
+ */ +hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, + const hsa_agent_t* consumers, uint64_t attributes, + hsa_signal_t* signal); + +/** + * @brief Returns a pointer to the value of a signal. + * + * Use of this API does not modify the lifetime of ::signal and any + * hsa_signal_value_t retrieved by this API has lifetime equal to that of + * ::signal. + * + * This API is intended for partial interoperability with non-HSA compatible + * devices and should not be used where HSA interfaces are available. + * + * Use of the signal value must comply with use restrictions of ::signal. + * Use may result in data races if the operations performed are not platform + * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC + * attributed signals is required. + * + * @param[in] signal Signal handle to extract the signal value pointer from. + * + * @param[out] value_ptr Location where the extracted signal value pointer will be placed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL. + */ +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + +/** + * @brief Asynchronous signal handler function type. + * + * @details Type definition of callback function to be used with + * hsa_amd_signal_async_handler. This callback is invoked if the associated + * signal and condition are met. The callback receives the value of the signal + * which satisfied the associated wait condition and a user provided value. If + * the callback returns true then the callback will be called again if the + * associated signal and condition are satisfied again.
If the callback returns + * false then it will not be called again. + * + * @param[in] value Contains the value of the signal observed by + * hsa_amd_signal_async_handler which caused the signal handler to be invoked. + * + * @param[in] arg Contains the user provided value given when the signal handler + * was registered with hsa_amd_signal_async_handler + * + * @retval true resumes monitoring the signal with this handler (as if calling + * hsa_amd_signal_async_handler again with identical parameters) + * + * @retval false stops monitoring the signal with this handler (handler will + * not be called again for this signal) + * + */ +typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg); + +/** + * @brief Register asynchronous signal handler function. + * + * @details Allows registering a callback function and user provided value with + * a signal and wait condition. The callback will be invoked if the associated + * signal and wait condition are satisfied. Callbacks will be invoked serially + * but in an arbitrary order so callbacks should be independent of each other. + * After being invoked a callback may continue to wait for its associated signal + * and condition and, possibly, be invoked again. Or the callback may stop + * waiting. If the callback returns true then it will continue waiting and may + * be called again. If false then the callback will not wait again and will not + * be called again for the associated signal and condition. It is possible to + * register the same callback multiple times with the same or different signals + * and/or conditions. Each registration of the callback will be treated entirely + * independently. 
+ * + * @param[in] signal hsa signal to be asynchronously monitored + * + * @param[in] cond condition value to monitor for + * + * @param[in] value signal value used in condition expression + * + * @param[in] handler asynchronous signal handler invoked when signal's + * condition is met + * + * @param[in] arg user provided value which is provided to handler when handler + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg); + +/** + * @brief Wait for all signal-condition pairs to be satisfied. + * + * @details Allows waiting for all of several signal and condition pairs to be + * satisfied. The function returns 0 if all signals met their conditions and -1 + * on a timeout. The value of each signal's satisfying value is returned in + * satisfying_value unless satisfying_value is nullptr. NULL and invalid signals + * are considered to have value 0 and their conditions already satisfied. This + * function provides only relaxed memory semantics. + */ +uint32_t HSA_API hsa_amd_signal_wait_all(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, hsa_signal_value_t* values, + uint64_t timeout_hint, hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_values); + +/** + * @brief Wait for any signal-condition pair to be satisfied. 
+ * + * @details Allows waiting for any of several signal and condition pairs to be + * satisfied. The function returns the index into the list of signals of the + * first satisfying signal-condition pair. The function returns + * std::numeric_limits<uint32_t>::max() if no valid signal is provided. The value + * of the satisfying signal's value is returned in satisfying_value, unless + * satisfying_value is nullptr or there's no valid signal in the signal-condition + * pairs. NULL and invalid signals are ignored. This function provides only + * relaxed memory semantics. + */ +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value); + +/** @} */ + +/** + * @brief Call a function asynchronously + * + * @details Provides access to the runtime's asynchronous event handling thread + * for general asynchronous functions. Functions queued this way are executed + * in the same manner as if they were a signal handler whose signal is + * satisfied. + * + * @param[in] callback asynchronous function to be invoked + * + * @param[in] arg user provided value which is provided to handler when handler + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT callback is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg); + +/** \addtogroup ext-images Images and samplers + * @{ + */ + +/** + * @brief Encodes an opaque vendor specific image format. The length of data + * depends on the underlying format.
This structure must not be copied as its + * true length can not be determined. + */ +typedef struct hsa_amd_image_descriptor_s { + /* + Version number of the descriptor + */ + uint32_t version; + + /* + Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID. + */ + uint32_t deviceID; + + /* + Start of vendor specific data. + */ + uint32_t data[1]; +} hsa_amd_image_descriptor_t; + +/** + * @brief Creates an image from an opaque vendor specific image format. + * Does not modify data at image_data. Intended initially for + * accessing interop images. + * + * @param[in] agent Agent on which to create the image + * + * @param[in] image_descriptor Vendor specific image format + * + * @param[in] image_data Pointer to image backing store + * + * @param[in] access_permission Access permissions for the image object + * + * @param[out] image Created image object. + * + * @retval HSA_STATUS_SUCCESS Image created successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor, + * null image_data, or mismatched access_permission. + */ +hsa_status_t HSA_API hsa_amd_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const hsa_amd_image_descriptor_t *image_layout, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image +); + +/** + * @brief Query image limits. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute HSA image info attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined.
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p value is NULL or @p attribute < + * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute > + * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS. + * + */ +hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Set a queue's CU affinity mask. + * + * @details Enables the queue to run on only selected CUs. The given mask is + * combined by bitwise AND with any device wide mask in HSA_CU_MASK before + * being applied. + * If num_cu_mask_count is 0 then the request is interpreted as a request to + * enable all CUs and no cu_mask array need be given. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[in] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed + * but the given mask attempted to enable a CU which was disabled by + * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not + * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL. + * Devices with work group processors must enable CUs in even-indexed + * contiguous pairs, e.g. 0x33(b'110011) is valid while 0x5(b'0101) and 0x6(b'0110) + * are invalid.
+ * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +/** + * @brief Retrieve a queue's CU affinity mask. + * + * @details Returns the first num_cu_mask_count bits of a queue's CU mask. + * Ensure that num_cu_mask_count is at least as large as + * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[out] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a memory pool. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_AMD_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_AMD_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_AMD_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_AMD_SEGMENT_GROUP = 3, +} hsa_amd_segment_t; + +/** + * @brief A memory pool encapsulates physical storage on an agent + * along with a memory access model. 
+ * + * @details A memory pool encapsulates a physical partition of an agent's + * memory system along with a memory access model. Division of a single + * memory system into separate pools allows querying each partition's access + * path properties (see ::hsa_amd_agent_memory_pool_get_info). Allocations + * from a pool are preferentially bound to that pool's physical partition. + * Binding to the pool's preferential physical partition may not be + * possible or persistent depending on the system's memory policy + * and/or state which is beyond the scope of HSA APIs. + * + * For example, a multi-node NUMA memory system may be represented by multiple + * pools with each pool providing size and access path information for the + * partition it represents. Allocations from a pool are preferentially bound + * to the pool's partition (which in this example is a NUMA node) while + * following its memory access model. The actual placement may vary or migrate + * due to the system's NUMA policy and state, which is beyond the scope of + * HSA APIs. + */ +typedef struct hsa_amd_memory_pool_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_amd_memory_pool_t; + +typedef enum hsa_amd_memory_pool_global_flag_s { + /** + * The application can use allocations in the memory pool to store kernel + * arguments, and provide the values for the kernarg segment of + * a kernel dispatch. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1, + /** + * Updates to memory in this pool conform to HSA memory consistency model. + * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED + * must not be set. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Writes to memory in this pool can be performed by a single agent at a time. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** Updates to memory in this memory pool have extended scope, acting as + * system-scope atomics for variables in memory regions of this type.
+ * Note: On non-compliant systems, device-specific actions may be required + * for system-scope coherence. */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8, + +} hsa_amd_memory_pool_global_flag_t; + +typedef enum hsa_amd_memory_pool_location_s { + /** + * This memory pool resides on the host (CPU) + */ + HSA_AMD_MEMORY_POOL_LOCATION_CPU = 0, + /** + * This memory pool resides on a GPU + */ + HSA_AMD_MEMORY_POOL_LOCATION_GPU = 1 +} hsa_amd_memory_pool_location_t; + +/** + * @brief Memory pool features. + */ +typedef enum { + /** + * Segment where the memory pool resides. The type of this attribute is + * ::hsa_amd_segment_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type + * of + * this attribute is uint32_t, a bit-field of + * ::hsa_amd_memory_pool_global_flag_t + * values. + */ + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this pool, in bytes. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SIZE = 2, + /** + * Indicates whether memory in this pool can be allocated using + * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for memory pools in the group and + * private segments. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by + * ::hsa_amd_memory_pool_allocate + * in this memory pool. The size of a buffer allocated in this pool is a + * multiple of the value of this attribute. While this is the minimum size of + * allocation allowed, it is recommended to use + * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE to obtain the recommended + * allocation granularity size for this pool. + * The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for + * this pool.
The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this + * pool. The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and + * must be a power of 2. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, + /** + * This memory pool can be made directly accessible by all the agents in the + * system (::hsa_amd_agent_memory_pool_get_info does not return + * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this + * attribute is bool. + */ + HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15, + /** + * Maximum aggregate allocation size in bytes. The type of this attribute + * is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16, + /** + * Location of this memory pool. The type of this attribute + * is hsa_amd_memory_pool_location_t. + */ + HSA_AMD_MEMORY_POOL_INFO_LOCATION = 17, + /** + * Internal block size for allocations. This would also be the recommended + * granularity size for allocations as this prevents internal fragmentation. + * The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool. + * The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE = 18, +} hsa_amd_memory_pool_info_t; + +/** + * @brief Memory pool flag used to specify allocation directives + * + */ +typedef enum hsa_amd_memory_pool_flag_s { + /** + * Allocates memory that conforms to standard HSA memory consistency model + */ + HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0, + /** + * Allocates fine grain memory type where memory ordering is per point to point + * connection. Atomic memory operations on these memory buffers are not + * guaranteed to be visible at system scope.
+ */ + HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0), + /** + * Allocates physically contiguous memory + */ + HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1), + /** + * Allocates executable memory + */ + HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG = (1 << 2), + +} hsa_amd_memory_pool_flag_t; + +/** + * @brief Get the current value of an attribute of a memory pool. + * + * @param[in] memory_pool A valid memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to a application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory pools associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details An agent can directly access buffers located in some memory pool, or + * be enabled to access them by the application (see ::hsa_amd_agents_allow_access), + * yet that memory pool may not be returned by this function for that given + * agent. + * + * A memory pool of fine-grained type must be associated only with the host. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked on the same thread that called + * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is + * associated with the agent. The HSA runtime passes two arguments to the + * callback: the memory pool, and the application data. If @p callback + * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration, + * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that status + * value. 
+ * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data); + +/** + * @brief Allocate a block of memory (or buffer) in the specified pool. + * + * @param[in] memory_pool Memory pool where to allocate memory from. The memory + * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. + * + * @param[out] ptr Pointer to the location where to store the base virtual + * address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in @p memory_pool. If the + * allocation fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p memory_pool, or @p size is greater than + * the value of HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0, + * or flags is not 0. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_amd_memory_pool_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent. + * Because the DMA engines used may not be in the same coherency domain, the caller must ensure + * that buffers are system-level coherent. In general this requires the sending device to have + * released the buffer to system scope prior to executing the copy API and the receiving device + * must execute a system scope acquire fence prior to use of the destination buffer. + * + * @param[out] dst Buffer where the content is to be copied. + * + * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. 
+ * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer, otherwise the copy will succeed + * but the contents of @p dst are undefined. + * + * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the source agent. + * Discovery may have variable and/or high latency. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the copy + * operation starts. The copy will start after every signal has been observed with + * the value 0. The dependent signal should not include completion signal from + * hsa_amd_memory_async_copy operation to be issued in future as that can result + * in a deadlock. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the copy + * operation. When the copy operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. The signal handle must not be 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The + * application is responsible for checking for asynchronous error conditions + * (see the description of @p completion_signal). + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0. + */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent on engine_id. + * + * WARNING: Concurrent use of this call with hsa_amd_memory_async_copy can result + * in resource conflicts as HSA runtime will auto assign engines with the latter + * call. Approach using both calls concurrently with caution. + * + * All param definitions are identical to hsa_amd_memory_async_copy with the + * exception of engine_id and force_copy_on_sdma. + * + * @param[in] engine_id Target engine defined by hsa_amd_sdma_engine_id_t. + * Client should use hsa_amd_memory_copy_engine_status first to get the ID + * availability. + * + * @param[in] force_copy_on_sdma By default, blit kernel copies are used when + * dst_agent == src_agent. Setting this to true will force the copy over SDMA1. + * + * All return definitions are identical to hsa_amd_memory_async_copy with the + * following amendments: + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0 or engine_id is improperly + * bounded.
+ */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy_on_engine(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal, + hsa_amd_sdma_engine_id_t engine_id, + bool force_copy_on_sdma); +/** + * @brief Reports the availability of SDMA copy engines. + * + * @param[in] dst_agent Destination agent of copy status direction. + * + * @param[in] src_agent Source agent of copy status direction. + * + * @param[out] engine_ids_mask returns available SDMA engine IDs that can be masked + * with hsa_amd_sdma_engine_id_t. + * + * @retval ::HSA_STATUS_SUCCESS Agent has available SDMA engines. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Agent does not have available SDMA engines. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT dst_agent and src_agent are the same as + * dst_agent == src_agent is generally used for shader copies. + */ +hsa_status_t HSA_API +hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent, + uint32_t *engine_ids_mask); + /** + * @brief Returns the preferred SDMA engine mask. + * + * @param[in] dst_agent Destination agent of copy status direction. + * + * @param[in] src_agent Source agent of copy status direction. + * + * @param[out] recommended_ids_mask returns available SDMA engine IDs for max bandwidth + * that can be masked with hsa_amd_sdma_engine_id_t. Can be 0 if there is no preference + * + * @retval ::HSA_STATUS_SUCCESS For mask returned + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT dst_agent and src_agent are the same as + * dst_agent == src_agent is generally used for shader copies. + */ +hsa_status_t HSA_API +hsa_amd_memory_get_preferred_copy_engine(hsa_agent_t dst_agent, hsa_agent_t src_agent, + uint32_t* recommended_ids_mask); + +/* +[Provisional API] +Pitched memory descriptor. +All elements must be 4 byte aligned. Pitch and slice are in bytes. 
+*/ +typedef struct hsa_pitched_ptr_s { + void* base; + size_t pitch; + size_t slice; +} hsa_pitched_ptr_t; + +/* +[Provisional API] +Copy direction flag. +*/ +typedef enum { + hsaHostToHost = 0, + hsaHostToDevice = 1, + hsaDeviceToHost = 2, + hsaDeviceToDevice = 3 +} hsa_amd_copy_direction_t; + +/* +[Provisional API] +SDMA 3D memory copy API. The same requirements must be met by src and dst as in +hsa_amd_memory_async_copy. +Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects +must not overlap. +CPU agents are not supported. API requires SDMA and will return an error if SDMA is not available. +Offsets and range carry x in bytes, y and z in rows and layers. +*/ +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Type of accesses to a memory pool from a given agent. + */ +typedef enum { + /** + * The agent cannot directly access any buffer in the memory pool. + */ + HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0, + /** + * The agent can directly access a buffer located in the pool; the application + * does not need to invoke ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1, + /** + * The agent can directly access a buffer located in the pool, but only if the + * application has previously requested access to that buffer using + * ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2 +} hsa_amd_memory_pool_access_t; + +/** + * @brief Type of bus/link used when accessing a memory pool from an agent. + */ +typedef enum { + /** + * Hyper-transport bus type. + */ + HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0, + + /** + * QPI bus type.
+ */ + HSA_AMD_LINK_INFO_TYPE_QPI = 1, + + /** + * PCIe bus type. + */ + HSA_AMD_LINK_INFO_TYPE_PCIE = 2, + + /** + * InfiniBand bus type. + */ + HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3, + + /** + * xGMI link type. + */ + HSA_AMD_LINK_INFO_TYPE_XGMI = 4 + +} hsa_amd_link_info_type_t; + +/** + * @brief Link properties when accessing the memory pool from the specified + * agent. + */ +typedef struct hsa_amd_memory_pool_link_info_s { + /** + * Minimum transfer latency (rounded to ns). + */ + uint32_t min_latency; + + /** + * Maximum transfer latency (rounded to ns). + */ + uint32_t max_latency; + + /** + * Minimum link interface bandwidth in MB/s. + */ + uint32_t min_bandwidth; + + /** + * Maximum link interface bandwidth in MB/s. + */ + uint32_t max_bandwidth; + + /** + * Support for 32-bit atomic transactions. + */ + bool atomic_support_32bit; + + /** + * Support for 64-bit atomic transactions. + */ + bool atomic_support_64bit; + + /** + * Support for cache coherent transactions. + */ + bool coherent_support; + + /** + * The type of bus/link. + */ + hsa_amd_link_info_type_t link_type; + + /** + * NUMA distance of memory pool relative to querying agent + */ + uint32_t numa_distance; +} hsa_amd_memory_pool_link_info_t; + +/** + * @brief Properties of the relationship between an agent and a memory pool. + */ +typedef enum { + /** + * Access to buffers located in the memory pool. The type of this attribute + * is ::hsa_amd_memory_pool_access_t. + * + * An agent can always directly access buffers currently located in a memory + * pool that is associated (the memory_pool is one of the values returned by + * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent.
If the + * buffer is currently located in a memory pool that is not associated with + * the agent, and the value returned by this function for the given + * combination of agent and memory pool is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke + * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer. + * + * If the given agent can directly access buffers in the pool, the result is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated with + * the agent, or it is of fine-grained type, the result must not be + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not associated + * with the agent, and does not reside in the global segment, the result must + * be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0, + + /** + * Number of links to hop when accessing the memory pool from the specified + * agent. The value of this attribute is zero if the memory pool is associated + * with the agent, or if the access type is + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is + * uint32_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1, + + /** + * Details of each link hop when accessing the memory pool starting from the + * specified agent. The type of this attribute is an array of size + * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS with each element containing + * ::hsa_amd_memory_pool_link_info_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2 + +} hsa_amd_agent_memory_pool_info_t; + +/** + * @brief Get the current value of an attribute of the relationship between an + * agent and a memory pool. + * + * @param[in] agent Agent. + * + * @param[in] memory_pool Memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute.
If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value); + +/** + * @brief Enable direct access to a buffer from a given set of agents. + * + * @details + * + * Upon return, only the listed agents and the agent associated with the + * buffer's memory pool have direct access to the @p ptr. + * + * Any agent that has access to the buffer before and after the call to + * ::hsa_amd_agents_allow_access will also have access while + * ::hsa_amd_agents_allow_access is in progress. + * + * The caller is responsible for ensuring that each agent in the list + * must be able to access the memory pool containing @p ptr + * (using ::hsa_amd_agent_memory_pool_get_info with ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute), + * otherwise error code is returned. + * + * @param[in] num_agents Size of @p agents. + * + * @param[in] agents List of agents. If @p num_agents is 0, this argument is + * ignored. + * + * @param[in] flags A list of bit-field that is used to specify access + * information in a per-agent basis. This is currently reserved and must be NULL. + * + * @param[in] ptr A buffer previously allocated using ::hsa_amd_memory_pool_allocate. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, or @p agents + * is NULL, @p flags is not NULL, or attempting to enable access to agent(s) + * because @p ptr is allocated from an inaccessible pool. 
+ * + */ +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr); + +/** + * @brief Query if buffers currently located in some memory pool can be + * relocated to a destination memory pool. + * + * @details If the returned value is non-zero, a migration of a buffer to @p + * dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail due to + * resource limitations. + * + * @param[in] src_memory_pool Source memory pool. + * + * @param[in] dst_memory_pool Destination memory pool. + * + * @param[out] result Pointer to a memory location where the result of the query + * is stored. Must not be NULL. If buffers currently located in @p + * src_memory_pool can be relocated to @p dst_memory_pool, the result is + * true. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result); + +/** + * @brief Relocate a buffer to a new memory pool. + * + * @details When a buffer is migrated, its virtual address remains the same but + * its physical contents are moved to the indicated memory pool. + * + * After migration, only the agent associated with the destination pool will have access. + * + * The caller is also responsible for ensuring that the allocation in the + * source memory pool where the buffer is currently located can be migrated to the + * specified destination memory pool (using ::hsa_amd_memory_pool_can_migrate returns a value of true + * for the source and destination memory pools), otherwise behavior is undefined. 
+ * + * The caller must ensure that the buffer is not accessed while it is migrated. + * + * @param[in] ptr Buffer to be relocated. The buffer must have been released to system + * prior to call this API. The buffer will be released to system upon completion. + * + * @param[in] memory_pool Memory pool where to place the buffer. + * + * @param[in] flags A bit-field that is used to specify migration + * information. Must be zero. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t memory_pool, + uint32_t flags); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e multiple mappings are permitted). In + * this case, the same input @p host_ptr may give different locked @p agent_ptr and when it does, + * they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to @p agent_ptr are coarse grained. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handle to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). + * In this case, the same input @p host_ptr may give different locked @p agent_ptr and when it + * does, they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to the memory via @p agent_ptr have the same access properties as memory allocated from + * @p pool as determined by ::hsa_amd_memory_pool_get_info and ::hsa_amd_agent_memory_pool_get_info + * (ex. coarse/fine grain, platform atomic support, link info). Physical composition and placement + * of the memory (ex. page size, NUMA binding) is not changed. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handle to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[in] pool Global memory pool owned by a CPU agent.
+ * + * @param[in] flags A bit-field that is used to specify allocation + * directives. Reserved parameter, must be 0. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is + * invalid or can not access @p pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned + * by a CPU agent. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0 or flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, + uint32_t flags, void** agent_ptr); + +/** + * + * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or + * ::hsa_amd_memory_lock_to_pool. + * + * @details The behavior is undefined if the host pointer being unpinned does not + * match previous pinned address or if the host pointer was already deallocated. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was + * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); + +/** + * @brief Sets the first @p count of uint32_t of the block of memory pointed by + * @p ptr to the specified @p value. 
+ * + * @param[in] ptr Pointer to the block of memory to fill. + * + * @param[in] value Value to be set. + * + * @param[in] count Number of uint32_t element to be set to the value. + * + * @retval HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or + * not 4 bytes aligned + * + * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory + * region was not allocated with HSA runtime APIs. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count); + +/** + * @brief Maps an interop object into the HSA flat address space and establishes + * memory residency. The metadata pointer is valid during the lifetime of the + * map (until hsa_amd_interop_unmap_buffer is called). + * Multiple calls to hsa_amd_interop_map_buffer with the same interop_handle + * result in multiple mappings with potentially different addresses and + * different metadata pointers. Concurrent operations on these addresses are + * not coherent. Memory must be fenced to system scope to ensure consistency, + * between mappings and with any views of this buffer in the originating + * software stack. + * + * @param[in] num_agents Number of agents which require access to the memory + * + * @param[in] agents List of accessing agents. 
+ * + * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux) + * + * @param [in] flags Reserved, must be 0 + * + * @param[out] size Size in bytes of the mapped object + * + * @param[out] ptr Base address of the mapped object + * + * @param[out] metadata_size Size of metadata in bytes, may be NULL + * + * @param[out] metadata Pointer to metadata, may be NULL + * + * @retval HSA_STATUS_SUCCESS if successfully mapped + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors + */ +hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, + int interop_handle, + uint32_t flags, + size_t* size, + void** ptr, + size_t* metadata_size, + const void** metadata); + +/** + * @brief Removes a previously mapped interop object from HSA's flat address space. + * Ends lifetime for the mapping's associated metadata pointer. + */ +hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr); + +/** + * @brief Denotes the type of memory in a pointer info query. + */ +typedef enum { + /* + Memory is not known to the HSA driver. Unallocated or unlocked system memory. + */ + HSA_EXT_POINTER_TYPE_UNKNOWN = 0, + /* + Memory was allocated with an HSA memory allocator. + */ + HSA_EXT_POINTER_TYPE_HSA = 1, + /* + System memory which has been locked for use with an HSA agent. + + Memory of this type is normal malloc'd memory and is always accessible to + the CPU. Pointer info queries may not include CPU agents in the accessible + agents list as the CPU has implicit access. + */ + HSA_EXT_POINTER_TYPE_LOCKED = 2, + /* + Memory originated in a graphics component and is shared with ROCr. + */ + HSA_EXT_POINTER_TYPE_GRAPHICS = 3, + /* + Memory has been shared with the local process via ROCr IPC APIs. 
+ */ + HSA_EXT_POINTER_TYPE_IPC = 4, + /* + No backend memory but virtual address + */ + HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5 +} hsa_amd_pointer_type_t; + +/** + * @brief Describes a memory allocation known to ROCr. + * Within a ROCr major version this structure can only grow. + */ +typedef struct hsa_amd_pointer_info_s { + /* + Size in bytes of this structure. Used for version control within a major ROCr + revision. Set to sizeof(hsa_amd_pointer_info_t) prior to calling + hsa_amd_pointer_info. If the runtime supports an older version of pointer + info then size will be smaller on return. Members starting after the return + value of size will not be updated by hsa_amd_pointer_info. + */ + uint32_t size; + /* + The type of allocation referenced. + */ + hsa_amd_pointer_type_t type; + /* + Base address at which non-host agents may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* agentBaseAddress; + /* + Base address at which the host agent may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* hostBaseAddress; + /* + Size of the allocation. This field is not meaningful if the type of the allocation + is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + size_t sizeInBytes; + /* + Application provided value. This field is not meaningful if the type of the + allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* userData; + /* + Reports an agent which "owns" (i.e. has preferred access to) the pool in which the + allocation was + made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die + GPU boards) any such agent may be returned. This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in + this process, e.g. if this agent is masked using ROCR_VISIBLE_DEVICES.
+ */ + hsa_agent_t agentOwner; + /* + Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. + Reports the effective global flags bitmask for the allocation. This field is not + meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + uint32_t global_flags; +} hsa_amd_pointer_info_t; + +/** + * @brief Retrieves information about the allocation referenced by the given + * pointer. Optionally returns the number and list of agents which can + * directly access the allocation. In case this virtual address is unknown, the + * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields + * that are valid after hsa_amd_pointer_info returns are size and type. + * + * @param[in] ptr Pointer which references the allocation to retrieve info for. + * + * @param[in, out] info Pointer to structure to be filled with allocation info. + * Data member size must be set to the size of the structure prior to calling + * hsa_amd_pointer_info. On return size will be set to the size of the + * pointer info structure supported by the runtime, if smaller. Members + * beyond the returned value of size will not be updated by the API. + * Must not be NULL. + * + * @param[in] alloc Function pointer to an allocator used to allocate the + * @p accessible array. If NULL @p accessible will not be returned. + * + * @param[out] num_agents_accessible Receives the count of agents in + * @p accessible. If NULL @p accessible will not be returned. + * + * @param[out] accessible Receives a pointer to the array, allocated by @p alloc, + * holding the list of agents which may directly access the allocation. + * May be NULL. + * + * @retval HSA_STATUS_SUCCESS Info retrieved successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
+ */ +hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr, + hsa_amd_pointer_info_t* info, + void* (*alloc)(size_t), + uint32_t* num_agents_accessible, + hsa_agent_t** accessible); + +/** + * @brief Associates an arbitrary pointer with an allocation known to ROCr. + * The pointer can be fetched by hsa_amd_pointer_info in the userData field. + * + * @param[in] ptr Pointer to the first byte of an allocation known to ROCr + * with which to associate @p userdata. + * + * @param[in] userdata Arbitrary pointer to associate with the allocation. + * + * @retval HSA_STATUS_SUCCESS @p userdata successfully stored. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr. + */ +hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr, + void* userdata); + +/** + * @brief 256-bit process independent identifier for a ROCr shared memory + * allocation. + */ +typedef struct hsa_amd_ipc_memory_s { + uint32_t handle[8]; +} hsa_amd_ipc_memory_t; + +/** + * @brief Prepares an allocation for interprocess sharing and creates a + * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation. A + * handle is valid while the allocation it references remains accessible in + * any process. In general applications should confirm that a shared memory + * region has been attached (via hsa_amd_ipc_memory_attach) in the remote + * process prior to releasing that memory in the local process. + * Repeated calls for the same allocation may, but are not required to, return + * unique handles. The allocation needs to be on memory on an agent of type + * HSA_DEVICE_TYPE_GPU. + * + * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for + * sharing. + * + * @param[in] len Length in bytes of the allocation to share.
+ * + * @param[out] handle Process independent identifier referencing the shared + * allocation. + * + * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the + * first byte of an allocation made through ROCr, or @p len is not the full length + * of the allocation or @p handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len, + hsa_amd_ipc_memory_t* handle); + +/** + * @brief Imports shared memory into the local process and makes it accessible + * by the given agents. If a shared memory handle is attached multiple times + * in a process each attach may return a different address. Each returned + * address is refcounted and requires a matching number of calls to + * hsa_amd_ipc_memory_detach to release the shared memory mapping. + * + * @param[in] handle Pointer to the identifier for the shared memory. + * + * @param[in] len Length of the shared memory to import. + * Reserved. Must be the full length of the shared allocation in this version. + * + * @param[in] num_agents Count of agents in @p mapping_agents. + * May be zero if all agents are to be allowed access. + * + * @param[in] mapping_agents List of agents to access the shared memory. + * Ignored if @p num_agents is zero. + * + * @param[out] mapped_ptr Receives a process local pointer to the shared memory. + * + * @retval HSA_STATUS_SUCCESS if memory is successfully imported.
+ * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is + * incorrect, @p mapped_ptr is NULL, or some agent for which access was + * requested can not access the shared memory. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_attach( + const hsa_amd_ipc_memory_t* handle, size_t len, + uint32_t num_agents, + const hsa_agent_t* mapping_agents, + void** mapped_ptr); + +/** + * @brief Decrements the reference count for the shared memory mapping and + * releases access to shared memory imported with hsa_amd_ipc_memory_attach. + * + * @param[in] mapped_ptr Pointer to the first byte of a shared allocation + * imported with hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_SUCCESS if @p mapped_ptr was imported with + * hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported + * with hsa_amd_ipc_memory_attach. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr); + +/** @} */ + +/** \addtogroup status Runtime notifications + * @{ + */ + +/** + * @brief 256-bit process independent identifier for a ROCr IPC signal. + */ +typedef hsa_amd_ipc_memory_t hsa_amd_ipc_signal_t; + +/** + * @brief Obtains an interprocess sharing handle for a signal. The handle is + * valid while the signal it references remains valid in any process. In + * general applications should confirm that the signal has been attached (via + * hsa_amd_ipc_signal_attach) in the remote process prior to destroying that + * signal in the local process. + * Repeated calls for the same signal may, but are not required to, return + * unique handles. + * + * @param[in] signal Signal created with attribute HSA_AMD_SIGNAL_IPC. 
+ * + * @param[out] handle Process independent identifier referencing the shared + * signal. + * + * @retval HSA_STATUS_SUCCESS @p handle is ready to use for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is not a valid signal + * created with attribute HSA_AMD_SIGNAL_IPC or handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_signal_create(hsa_signal_t signal, hsa_amd_ipc_signal_t* handle); + +/** + * @brief Imports an IPC capable signal into the local process. If an IPC + * signal handle is attached multiple times in a process each attach may return + * a different signal handle. Each returned signal handle is refcounted and + * requires a matching number of calls to hsa_signal_destroy to release the + * shared signal. + * + * @param[in] handle Pointer to the identifier for the shared signal. + * + * @param[out] signal Receives a process local signal handle to the shared signal. + * + * @retval HSA_STATUS_SUCCESS if the signal is successfully imported. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid. + */ +hsa_status_t HSA_API hsa_amd_ipc_signal_attach(const hsa_amd_ipc_signal_t* handle, + hsa_signal_t* signal); + +/** + * @brief GPU system event type. + */ +typedef enum hsa_amd_event_type_s { + /* + AMD GPU memory fault. + */ + HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0, + /* + AMD GPU HW Exception. + */ + HSA_AMD_GPU_HW_EXCEPTION_EVENT, + /* + AMD GPU memory error. + */ + HSA_AMD_GPU_MEMORY_ERROR_EVENT, +} hsa_amd_event_type_t; + +/** + * @brief Flags denoting the cause of a memory fault. 
+ */ +typedef enum { + // Page not present or supervisor privilege. + HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0, + // Write access to a read-only page. + HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1, + // Execute access to a page marked NX. + HSA_AMD_MEMORY_FAULT_NX = 1 << 2, + // GPU attempted access to a host only page. + HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, + // DRAM ECC failure. + HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4, + // Can't determine the exact fault address. + HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, + // SRAM ECC failure (ie registers, no fault address). + HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6, + // GPU reset following unspecified hang. + HSA_AMD_MEMORY_FAULT_HANG = 1U << 31 +} hsa_amd_memory_fault_reason_t; + +/** + * @brief AMD GPU memory fault event data. + */ +typedef struct hsa_amd_gpu_memory_fault_info_s { + /* + The agent where the memory fault occurred. + */ + hsa_agent_t agent; + /* + Virtual address accessed. + */ + uint64_t virtual_address; + /* + Bit field encoding the memory access failure reasons. There could be multiple bits set + for one fault. Bits are defined in hsa_amd_memory_fault_reason_t. + */ + uint32_t fault_reason_mask; +} hsa_amd_gpu_memory_fault_info_t; + +/** + * @brief Flags denoting the cause of a memory error. + */ +typedef enum { + // Memory was in use by low-level HW component and cannot be released + HSA_AMD_MEMORY_ERROR_MEMORY_IN_USE = (1 << 0), +} hsa_amd_memory_error_reason_t; + +/** + * @brief AMD GPU memory error event data. + */ +typedef struct hsa_amd_gpu_memory_error_info_s { + /* + The agent where the memory error occurred. + */ + hsa_agent_t agent; + /* + Virtual address involved. + */ + uint64_t virtual_address; + /* + Bit field encoding the memory error failure reasons. There could be multiple bits set + for one error. Bits are defined in hsa_amd_memory_error_reason_t. 
+ */ + uint32_t error_reason_mask; +} hsa_amd_gpu_memory_error_info_t; + +/** + * @brief Flags denoting the type of a HW exception + */ +typedef enum { + // Unused for now + HSA_AMD_HW_EXCEPTION_RESET_TYPE_OTHER = 1 << 0, +} hsa_amd_hw_exception_reset_type_t; + +/** + * @brief Flags denoting the cause of a HW exception + */ +typedef enum { + // GPU Hang + HSA_AMD_HW_EXCEPTION_CAUSE_GPU_HANG = 1 << 0, + // SRAM ECC + HSA_AMD_HW_EXCEPTION_CAUSE_ECC = 1 << 1, +} hsa_amd_hw_exception_reset_cause_t; + +/** + * @brief AMD GPU HW Exception event data. + */ +typedef struct hsa_amd_gpu_hw_exception_info_s { + /* + The agent where the HW exception occurred. + */ + hsa_agent_t agent; + hsa_amd_hw_exception_reset_type_t reset_type; + hsa_amd_hw_exception_reset_cause_t reset_cause; +} hsa_amd_gpu_hw_exception_info_t; + +/** + * @brief AMD GPU event data passed to event handler. + */ +typedef struct hsa_amd_event_s { + /* + The event type. + */ + hsa_amd_event_type_t event_type; + union { + /* + The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT. + */ + hsa_amd_gpu_memory_fault_info_t memory_fault; + /* + The HW exception info, only valid when @p event_type is HSA_AMD_GPU_HW_EXCEPTION_EVENT. + */ + hsa_amd_gpu_hw_exception_info_t hw_exception; + /* + The memory error info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_ERROR_EVENT. + */ + hsa_amd_gpu_memory_error_info_t memory_error; + }; +} hsa_amd_event_t; + +typedef hsa_status_t (*hsa_amd_system_event_callback_t)(const hsa_amd_event_t* event, void* data); + +/** + * @brief Register AMD GPU event handler. + * + * @param[in] callback Callback to be invoked when an event is triggered. + * The HSA runtime passes two arguments to the callback: @p event + * is defined per event by the HSA runtime, and @p data is the user data. + * + * @param[in] data User data that is passed to @p callback. May be NULL. + * + * @retval HSA_STATUS_SUCCESS The handler has been registered successfully. 
+ * + * @retval HSA_STATUS_ERROR An event handler has already been registered. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p event is invalid. + */ +hsa_status_t HSA_API hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback, + void* data); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Per-queue dispatch and wavefront scheduling priority. + */ +typedef enum hsa_amd_queue_priority_s { + /* + Below normal/high priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_LOW = 0, + /* + Above low priority compute, below high priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_NORMAL = 1, + /* + Above low/normal priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_HIGH = 2, +} hsa_amd_queue_priority_t; + +/** + * @brief Modifies the dispatch and wavefront scheduling priority for a + * given compute queue. The default is HSA_AMD_QUEUE_PRIORITY_NORMAL. + * + * @param[in] queue Compute queue to apply new priority to. + * + * @param[in] priority Priority to associate with queue. + * + * @retval HSA_STATUS_SUCCESS if priority was changed successfully. + * + * @retval HSA_STATUS_ERROR_INVALID_QUEUE if queue is not a valid + * compute queue handle. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if priority is not a valid + * value from hsa_amd_queue_priority_t. + */ +hsa_status_t HSA_API hsa_amd_queue_set_priority(hsa_queue_t* queue, + hsa_amd_queue_priority_t priority); + +/** + * @brief Queue creation attributes. + */ +typedef enum { + /** + * The queue's packet buffer and queue descriptor struct should be + * allocated in system memory (default). Mutually exclusive with + * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF and + * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR. + */ + HSA_AMD_QUEUE_CREATE_SYSTEM_MEM = 0, + /** + * The queue's packet buffer should be allocated in the agent's + * fine-grain device memory region. 
+ */ + HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF = (1 << 0), + /** + * The queue descriptor struct should be allocated in the agent's + * fine-grain device memory region. Not supported for devices + * connected via PCIe because the CPU's atomic read-modify-write + * operations cannot be promoted to PCIe atomic read-modify-write + * operations. + */ + HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR = (1 << 1), +} hsa_amd_queue_create_flag_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Deallocation notifier function type. + */ +typedef void (*hsa_amd_deallocation_callback_t)(void* ptr, void* user_data); + +/** + * @brief Registers a deallocation notifier monitoring for release of agent + * accessible address @p ptr. If successful, @p callback will be invoked when + * @p ptr is removed from accessibility from all agents. + * + * Notification callbacks are automatically deregistered when they are invoked. + * + * Note: The current version supports notifications of address release + * originating from ::hsa_amd_memory_pool_free. Support for other address + * release APIs will follow. + * + * @param[in] ptr Agent accessible address to monitor for deallocation. Passed + * to @p callback. + * + * @param[in] callback Notifier to be invoked when @p ptr is released from + * agent accessibility. + * + * @param[in] user_data User provided value passed to @p callback. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The notifier registered successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p ptr does not refer to a valid agent accessible + * address. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL or @p ptr is NULL. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + */ +hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, + hsa_amd_deallocation_callback_t callback, + void* user_data); + +/** + * @brief Removes a deallocation notifier previously registered with + * ::hsa_amd_register_deallocation_callback. Arguments must be identical to + * those given in ::hsa_amd_register_deallocation_callback. + * + * @param[in] ptr Agent accessible address which was monitored for deallocation. + * + * @param[in] callback Notifier to be removed. + * + * @retval ::HSA_STATUS_SUCCESS The notifier has been removed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The given notifier was not registered. + */ +hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, + hsa_amd_deallocation_callback_t callback); + +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency + * model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent + * at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1, + /** + * Memory region queried contains subregions with both + * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and + * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes. + * + * This attribute can not be used in hsa_amd_svm_attributes_set. It is a + * possible return from hsa_amd_svm_attributes_get indicating that the query + * region contains both coarse and fine grained memory. + */ + HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. 
This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Queries the physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // Optimizes with the anticipation that the majority of operations to the + // range will be read operations. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6, + // Allows the execution on GPU. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range. 
Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based migration + // or associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. Bit values should be interpreted according to the type + // given in the associated attribute description. + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. 
+ * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in,out] attribute_list List of attributes to query for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. + * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. + * + * @param[in] ptr Will be aligned down to nearest page boundary. 
+ * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. + */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** @} */ + +/** \addtogroup profile Profiling + * @{ + */ + +/** + * @brief Acquire Stream Performance Monitor on an agent + * + * Acquire exclusive use of SPM on @p preferred_agent. + * See hsa_amd_spm_set_dest_buffer to provide a destination buffer to KFD to start recording and + * retrieve this data. + * @param[in] preferred_agent Agent on which to acquire SPM + */ +hsa_status_t hsa_amd_spm_acquire(hsa_agent_t preferred_agent); + +/** + * @brief Release Stream Performance Monitor on an agent + * + * Release exclusive use of SPM on @p preferred_agent. This will stop KFD writing SPM data. + * If a destination buffer is set, then data in the destination buffer is available to user + * when this function returns. + * + * @param[in] preferred_agent Agent on which to release SPM + */ +hsa_status_t hsa_amd_spm_release(hsa_agent_t preferred_agent); + +/** + * @brief Set up the current destination user mode buffer for stream performance + * counter data. 
KFD will start writing SPM data into the destination buffer. KFD will continue + * to copy data into the current destination buffer until any of the following functions are called + * - hsa_amd_spm_release + * - hsa_amd_spm_set_dest_buffer with dest set to NULL + * - hsa_amd_spm_set_dest_buffer with dest set to a new buffer + * + * If @p timeout is non-0, the call will wait for up to @p timeout ms for the previous + * buffer to be filled. If the previous buffer is filled before the timeout, @p timeout + * will be updated with the time remaining. If the timeout is exceeded, the function + * copies any partial data available into the previous user buffer and returns success. + * User should not access destination data while KFD is copying data. + * If the previous destination buffer was full, then @p is_data_loss flag is set. + * @p dest is CPU accessible memory. It could be malloc'ed memory or host allocated memory + * + * @param[in] preferred_agent Agent on which to set the dest buffer + * + * @param[in] size_in_bytes size of the buffer + * + * @param[in,out] timeout timeout in milliseconds + * + * @param[out] size_copied number of bytes copied + * + * @param[in] dest destination address. Set to NULL to stop copy on previous buffer + * + * @param[out] is_data_loss true if data was lost + */ +hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t size_in_bytes, + uint32_t* timeout, uint32_t* size_copied, void* dest, + bool* is_data_loss); + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Older version of export dmabuf + * + * This is the same as calling the v2 version of export dmabuf with the + * flags argument set to HSA_AMD_DMABUF_MAPPING_TYPE_NONE. + * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. 
+ * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf, + uint64_t* offset); + + /** + * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation. + * + * Obtains an OS specific handle to GPU agent memory. The memory must be part + * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent. + * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access + * to the allocation. + * + * Shared access to the memory is not guaranteed to be fine grain coherent even + * if the allocation exported is from a fine grain pool. The shared memory + * consistency model will be no stronger than the model exported from, consult + * the importing API to determine the final consistency model. + * + * The allocation's memory remains valid as long as the handle and any mapping + * of the handle remains valid. 
When the handle and all mappings are closed + * the backing memory will be released for reuse. + * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. + * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @param[in] flags Bitmask of hsa_amd_dma_buf_mapping_type_t flags. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size, + int* dmabuf, uint64_t* offset, uint64_t flags); + +/** + * @brief Closes an OS specific, vendor neutral, handle to a memory allocation. + * + * Closes an OS specific handle to GPU agent memory. + * + * Applications should close a handle after imports are complete. The handle + * is not required to remain open for the lifetime of imported mappings. The + * referenced allocation will remain valid until all handles and mappings + * are closed. + * + * @param[in] dmabuf Handle to be closed. 
+ * + * @retval ::HSA_STATUS_SUCCESS Handle closed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE A generic error was encountered + * when closing the handle. The handle may have been closed already or an + * async IO error may have occurred. + */ +hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); + +typedef enum hsa_amd_vmem_address_reserve_flag_s { + // Only reserve a VA range without registering it to the underlying driver + HSA_AMD_VMEM_ADDRESS_NO_REGISTER = (1UL << 0), +} hsa_amd_vmem_address_reserve_flag_t; + +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + * + * Note that this API will be deprecated in a future release and replaced by + * hsa_amd_vmem_address_reserve_align + */ +hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address, + uint64_t flags); + +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. 
+ * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2 + * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + */ +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + +/** + * @brief Free a reserved address range + * + * Free a previously allocated address range. The size must match the size of a previously + * allocated address range. + * + * @param[in] va virtual address to be freed + * @param[in] size of address range + * + * @retval ::HSA_STATUS_SUCCESS Address range released successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid va specified + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid size specified + * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE Address range is still in use + * @retval ::HSA_STATUS_ERROR Internal unexpected error + */ +hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size); + +/** + * @brief Struct containing an opaque handle to a memory allocation handle + */ +typedef struct hsa_amd_vmem_alloc_handle_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. 
+ */ + uint64_t handle; +} hsa_amd_vmem_alloc_handle_t; + +typedef enum { + MEMORY_TYPE_NONE, + MEMORY_TYPE_PINNED, +} hsa_amd_memory_type_t; + +/** + * @brief Create a virtual memory handle + * + * Create a virtual memory handle within this pool + * @p size must be aligned to the allocation granule size for this memory pool, see + * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE + * To minimize internal memory fragmentation, align the size to the recommended allocation granule + * size, see HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE + * + * @param[in] pool memory to use + * @param[in] size of the memory allocation + * @param[in] type of memory + * @param[in] flags - currently unsupported + * @param[out] memory_handle - handle for the allocation + * + * @retval ::HSA_STATUS_SUCCESS memory allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid arguments + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION This memory pool does not support allocations + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate this memory + */ +hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, + hsa_amd_memory_type_t type, uint64_t flags, + hsa_amd_vmem_alloc_handle_t* memory_handle); + +/** + * @brief Release a virtual memory handle + * + * @param[in] memory_handle handle that was previously allocated + * + * @retval ::HSA_STATUS_SUCCESS Memory handle released successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + */ +hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle); + +/** + * @brief Map a virtual memory handle + * + * Map a virtual memory handle to a reserved address range. The virtual address requested must be + * within a previously reserved address range. 
@p va and (@p va + size) must be must be within + * (va + size) of the previous allocated address range. + * @p size must be equal to size of the @p memory_handle + * hsa_amd_vmem_set_access needs to be called to make the memory accessible to specific agents + * + * @param[in] va virtual address range where memory will be mapped + * @param[in] size of memory mapping + * @param[in] in_offset offset into memory. Currently unsupported + * @param[in] memory_handle virtual memory handle to be mapped + * @param[in] flags. Currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS Memory mapped successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset, + hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags); + +/** + * @brief Unmap a virtual memory handle + * + * Unmap previously mapped virtual address range + * + * @param[in] va virtual address range to be unmapped + * @param[in] size of memory mapping + * + * @retval ::HSA_STATUS_SUCCESS Memory backing unmapped successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT size is invalid + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_unmap(void* va, size_t size); + +typedef struct hsa_amd_memory_access_desc_s { + hsa_access_permission_t permissions; + hsa_agent_t agent_handle; +} hsa_amd_memory_access_desc_t; + +/** + * @brief Make a memory mapping accessible + * + * Make previously mapped virtual address accessible to specific agents. @p size must be equal to + * size of previously mapped virtual memory handle.
+ * Calling hsa_amd_vmem_set_access multiple times on the same @p va: + * - Will overwrite permissions for agents specified in @p desc + * - Will leave permissions unchanged for agents not specified in @p desc + * + * @param[in] va previously mapped virtual address + * @param[in] size of memory mapping + * @param[in] desc list of access permissions for each agent + * @param[in] desc_cnt number of elements in desc + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent in desc + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, + const hsa_amd_memory_access_desc_t* desc, + size_t desc_cnt); + +/** + * @brief Get current access permissions for memory mapping + * + * Get access permissions for memory mapping for specific agent. + * + * @param[in] va previously mapped virtual address + * @param[out] perms current permissions + * @param[in] agent_handle agent + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION va is not mapped or permissions never set for this + * agent + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, + hsa_agent_t agent_handle); + +/** + * @brief Get an exportable shareable handle + * + * Get an exportable shareable handle for a memory_handle. This shareabl handle can then be used to + * re-create a virtual memory handle using hsa_amd_vmem_import_shareable_handle.
The shareable + * handle can be transferred using mechanisms that support posix file descriptors Once all shareable + * handles are closed, the memory_handle is released. + * + * @param[out] dmabuf_fd shareable handle + * @param[in] handle previously allocated virtual memory handle + * @param[in] flags Currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, + hsa_amd_vmem_alloc_handle_t handle, + uint64_t flags); +/** + * @brief Import a shareable handle + * + * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed + * and released results in undefined behavior. + * + * @param[in] dmabuf_fd shareable handle exported with hsa_amd_vmem_export_shareable_handle + * @param[out] handle virtual memory handle + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, + hsa_amd_vmem_alloc_handle_t* handle); + +/** + * @brief Returns memory handle for mapped memory + * + * Return a memory handle for previously mapped memory. The handle will be the same value of handle + * used to map the memory. The returned handle must be released with corresponding number of calls + * to hsa_amd_vmem_handle_release. 
+ * + * @param[out] memory_handle memory handle for this mapped address + * @param[in] mapped address + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid address + */ +hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle, + void* addr); + +/** + * @brief Returns the current allocation properties of a handle + * + * Returns the allocation properties of an existing handle + * + * @param[in] memory_handle memory handle to be queried + * @param[out] pool memory pool that owns this handle + * @param[out] type memory type + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle + */ +hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( + hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool, + hsa_amd_memory_type_t* type); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Set the asynchronous scratch limit threshold on all the queues for this agent. + * Dispatches that are enqueued on HW queues on this agent that are smaller than threshold will not + * result in a scratch use-once method. + * + * Increasing this threshold will only increase the internal limit and not cause immediate allocation + * of additional scratch memory. Decreasing this threshold will result in a release in scratch memory + * on queues where the current amount of allocated scratch exceeds the new limit. + * + * If this API call would result in a release in scratch memory and there are dispatches that are + * currently using scratch memory on this agent, this will result into a blocking call until the + * current dispatches are completed. + * + * This API is only supported on devices that support asynchronous scratch reclaim. + * + * @param[in] agent A valid agent. + * + * @param[in] threshold Threshold size in bytes + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT This agent does not support asynchronous scratch + * reclaim + */ +hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); + +typedef enum { + /** + * Returns the agent that owns the underlying HW queue. + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_QUEUE_INFO_AGENT, + /** + * Returns the doorbell ID of the completion signal of the queue. + * The type of this attribute is uint64_t. + */ + HSA_AMD_QUEUE_INFO_DOORBELL_ID, +} hsa_queue_info_attribute_t; + +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + +/** + * @brief Logging types + */ +typedef enum hsa_amd_log_flag_s { + /** Log AQL packets internally enqueued by HSA for Blit Kernels */ + HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS = 0, +} hsa_amd_log_flag_t; + +/** + * @brief Enable logging via external file + * If this function is called multiple times, the last call to this function will overwrite the + * previous @p flags and @p file. + * + * @param[in] flags is used to filter types of logging. Type is uint8_t[8]. + * Can be set using the hsa_flag_set64 macro. Setting @p flags to 0 will disable logging. + * @param[in] file file stream to output logging. If file is NULL, prints are sent to stderr. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ */ +hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void* file); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/hsa/hsa_ext_finalize.h b/hsa/hsa_ext_finalize.h new file mode 100644 index 0000000000..94c4582055 --- /dev/null +++ b/hsa/hsa_ext_finalize.h @@ -0,0 +1,531 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ +#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_FINALIZER +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct BrigModuleHeader; +typedef struct BrigModuleHeader* BrigModule_t; + +/** \defgroup ext-alt-finalizer-extensions Finalization Extensions + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + */ +enum { + /** + * The HSAIL program is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000, + /** + * The HSAIL module is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001, + /** + * Machine model or profile of the HSAIL module do not match the machine model + * or profile of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002, + /** + * The HSAIL module is already a part of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003, + /** + * Compatibility mismatch between symbol declaration and symbol definition. + */ + HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004, + /** + * The finalization encountered an error while finalizing a kernel or + * indirect function. + */ + HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005, + /** + * Mismatch between a directive in the control directive structure and in + * the HSAIL kernel. 
+ */ + HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006 +}; + +/** @} */ + +/** \defgroup ext-alt-finalizer-program Finalization Program + * @{ + */ + +/** + * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains + * the definition of the BrigModule_t type. + */ +typedef BrigModule_t hsa_ext_module_t; + +/** + * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL + * modules that collectively define functions and variables used by kernels and + * indirect functions. + */ +typedef struct hsa_ext_program_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_ext_program_t; + +/** + * @brief Create an empty HSAIL program. + * + * @param[in] machine_model Machine model used in the HSAIL program. + * + * @param[in] profile Profile used in the HSAIL program. + * + * @param[in] default_float_rounding_mode Default float rounding mode used in + * the HSAIL program. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] program Memory location where the HSA runtime stores the newly + * created HSAIL program handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid, + * @p profile is invalid, @p default_float_rounding_mode is invalid, or + * @p program is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_create( + hsa_machine_model_t machine_model, + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_ext_program_t *program); + +/** + * @brief Destroy a HSAIL program. + * + * @details The HSAIL program handle becomes invalid after it has been + * destroyed. 
Code object handles produced by ::hsa_ext_program_finalize are + * still valid after the HSAIL program has been destroyed, and can be used as + * intended. Resources allocated outside and associated with the HSAIL program + * (such as HSAIL modules that are added to the HSAIL program) can be released + * after the finalization program has been destroyed. + * + * @param[in] program HSAIL program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + */ +hsa_status_t HSA_API hsa_ext_program_destroy( + hsa_ext_program_t program); + +/** + * @brief Add a HSAIL module to an existing HSAIL program. + * + * @details The HSA runtime does not perform a deep copy of the HSAIL module + * upon addition. Instead, it stores a pointer to the HSAIL module. The + * ownership of the HSAIL module belongs to the application, which must ensure + * that @p module is not released before destroying the HSAIL program. + * + * The HSAIL module is successfully added to the HSAIL program if @p module is + * valid, if all the declarations and definitions for the same symbol are + * compatible, and if @p module specify machine model and profile that matches + * the HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] module HSAIL module. The application can add the same HSAIL module + * to @p program at most once. The HSAIL module must specify the same machine + * model and profile as @p program. If the floating-mode rounding mode of @p + * module is not default, then it should match that of @p program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p + * module does not match machine model of @p program, or the profile of @p + * module does not match profile of @p program. + * + * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is + * already a part of the HSAIL program. + * + * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol + * definition compatibility mismatch. See the symbol compatibility rules in the + * HSA Programming Reference Manual. + */ +hsa_status_t HSA_API hsa_ext_program_add_module( + hsa_ext_program_t program, + hsa_ext_module_t module); + +/** + * @brief Iterate over the HSAIL modules in a program, and invoke an + * application-defined callback on every iteration. + * + * @param[in] program HSAIL program. + * + * @param[in] callback Callback to be invoked once per HSAIL module in the + * program. The HSA runtime passes three arguments to the callback: the program, + * a HSAIL module, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_ext_program_iterate_modules returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. 
+ */ +hsa_status_t HSA_API hsa_ext_program_iterate_modules( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, + void* data), + void* data); + +/** + * @brief HSAIL program attributes. + */ +typedef enum { + /** + * Machine model specified when the HSAIL program was created. The type + * of this attribute is ::hsa_machine_model_t. + */ + HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0, + /** + * Profile specified when the HSAIL program was created. The type of + * this attribute is ::hsa_profile_t. + */ + HSA_EXT_PROGRAM_INFO_PROFILE = 1, + /** + * Default float rounding mode specified when the HSAIL program was + * created. The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2 +} hsa_ext_program_info_t; + +/** + * @brief Get the current value of an attribute for a given HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behaviour is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * HSAIL program attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_get_info( + hsa_ext_program_t program, + hsa_ext_program_info_t attribute, + void *value); + +/** + * @brief Finalizer-determined call convention. + */ +typedef enum { + /** + * Finalizer-determined call convention. 
+ */ + HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1 +} hsa_ext_finalizer_call_convention_t; + +/** + * @brief Control directives specify low-level information about the + * finalization process. + */ +typedef struct hsa_ext_control_directives_s { + /** + * Bitset indicating which control directives are enabled. The bit assigned to + * a control directive is determined by the corresponding value in + * BrigControlDirective. + * + * If a control directive is disabled, its corresponding field value (if any) + * must be 0. Control directives that are only present or absent (such as + * partial workgroups) have no corresponding field as the presence of the bit + * in this mask is sufficient. + */ + uint64_t control_directives_mask; + /** + * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit + * assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t break_exceptions_mask; + /** + * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The + * bit assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t detect_exceptions_mask; + /** + * Maximum size (in bytes) of dynamic group memory that will be allocated by + * the application for any dispatch of the kernel. If the kernel contains a + * maxdynamicsize control directive, the two values should match. + */ + uint32_t max_dynamic_group_size; + /** + * Maximum number of grid work-items that will be used by the application to + * launch the kernel. If the kernel contains a maxflatgridsize control + * directive, the value of @a max_flat_grid_size must not be greater than the + * value of the directive, and takes precedence.
+ * + * The value specified for maximum absolute grid size must be greater than or + * equal to the product of the values specified by @a required_grid_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint64_t max_flat_grid_size; + /** + * Maximum number of work-group work-items that will be used by the + * application to launch the kernel. If the kernel contains a + * maxflatworkgroupsize control directive, the value of @a + * max_flat_workgroup_size must not be greater than the value of the + * directive, and takes precedence. + * + * The value specified for maximum absolute work-group size must be greater than or + * equal to the product of the values specified by @a required_workgroup_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint32_t max_flat_workgroup_size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Grid size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredgridsize control directive, the + * dimensions should match. + * + * The specified grid size must be consistent with @a required_workgroup_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_grid_size. Note that the listed invariants must hold + * only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + uint64_t required_grid_size[3]; + /** + * Work-group size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredworkgroupsize control directive, + * the dimensions should match.
+ * + * The specified work-group size must be consistent with @a required_grid_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_workgroup_size. Note that the listed invariants must + * hold only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + hsa_dim3_t required_workgroup_size; + /** + * Number of dimensions that will be used by the application to launch the + * kernel. If the kernel contains a requireddim control directive, the two + * values should match. + * + * The specified dimensions must be consistent with @a required_grid_size and + * @a required_workgroup_size. This invariant must hold only if all the + * corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a + * control_directives_mask, this field must be 1, 2, or 3. + */ + uint8_t required_dim; + /** + * Reserved. Must be 0. + */ + uint8_t reserved2[75]; +} hsa_ext_control_directives_t; + +/** + * @brief Finalize an HSAIL program for a given instruction set architecture. + * + * @details Finalize all of the kernels and indirect functions that belong to + * the same HSAIL program for a specific instruction set architecture (ISA). The + * transitive closure of all functions specified by call or scall must be + * defined. Kernels and indirect functions that are being finalized must be + * defined. Kernels and indirect functions that are referenced in kernels and + * indirect functions being finalized may or may not be defined, but must be + * declared. All the global/readonly segment variables that are referenced in + * kernels and indirect functions being finalized may or may not be defined, but + * must be declared. + * + * @param[in] program HSAIL program. 
+ * + * @param[in] isa Instruction set architecture to finalize for. + * + * @param[in] call_convention A call convention used in a finalization. Must + * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive) + * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p + * isa (not inclusive). + * + * @param[in] control_directives Low-level control directives that influence + * the finalization process. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[in] code_object_type Type of code object to produce. + * + * @param[out] code_object Code object generated by the Finalizer, which + * contains the machine code for the kernels and indirect functions in the HSAIL + * program. The code object is independent of the HSAIL module that was used to + * generate it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in + * the control directive structure and in the HSAIL kernel mismatch, or if the + * same directive is used with a different value in one of the functions used by + * this kernel. + * + * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer + * encountered an error while compiling a kernel or an indirect function. 
+ */ +hsa_status_t HSA_API hsa_ext_program_finalize( + hsa_ext_program_t program, + hsa_isa_t isa, + int32_t call_convention, + hsa_ext_control_directives_t control_directives, + const char *options, + hsa_code_object_type_t code_object_type, + hsa_code_object_t *code_object); + +/** @} */ + +#define hsa_ext_finalizer_1_00 + +typedef struct hsa_ext_finalizer_1_00_pfn_s { + hsa_status_t (*hsa_ext_program_create)( + hsa_machine_model_t machine_model, hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, hsa_ext_program_t *program); + + hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program); + + hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program, + hsa_ext_module_t module); + + hsa_status_t (*hsa_ext_program_iterate_modules)( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, + hsa_ext_module_t module, void *data), + void *data); + + hsa_status_t (*hsa_ext_program_get_info)( + hsa_ext_program_t program, hsa_ext_program_info_t attribute, + void *value); + + hsa_status_t (*hsa_ext_program_finalize)( + hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, + hsa_ext_control_directives_t control_directives, const char *options, + hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object); +} hsa_ext_finalizer_1_00_pfn_t; + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ diff --git a/hsa/hsa_ext_image.h b/hsa/hsa_ext_image.h new file mode 100644 index 0000000000..cad9b50820 --- /dev/null +++ b/hsa/hsa_ext_image.h @@ -0,0 +1,1515 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_EXT_IMAGE_H +#define HSA_EXT_IMAGE_H + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_IMAGES +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +/** \defgroup ext-images Images and Samplers + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * Image format is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000, + /** + * Image size is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001, + /** + * Image pitch is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002, + /** + * Sampler descriptor is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003 +}; + +/** + * @brief Enumeration constants added to ::hsa_agent_info_t by this + * extension. + * + * @remark Additions to hsa_agent_info_t + */ +enum { + /** + * Maximum number of elements in 1D images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000, + /** + * Maximum number of elements in 1DA images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001, + /** + * Maximum number of elements in 1DB images. Must be at least 65536. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002, + /** + * Maximum dimensions (width, height) of 2D images, in image elements. The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003, + /** + * Maximum dimensions (width, height) of 2DA images, in image elements. 
The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004, + /** + * Maximum dimensions (width, height) of 2DDEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005, + /** + * Maximum dimensions (width, height) of 2DADEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006, + /** + * Maximum dimensions (width, height, depth) of 3D images, in image + * elements. The maximum along any dimension must be at least 2048. The type + * of this attribute is size_t[3]. + */ + HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007, + /** + * Maximum number of image layers in an image array. Must be at least 2048. The + * type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008, + /** + * Maximum number of read-only image handles that can be created for an agent at any one + * time. Must be at least 128. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009, + /** + * Maximum number of write-only and read-write image handles (combined) that + * can be created for an agent at any one time. Must be at least 64. The type of this + * attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A, + /** + * Maximum number of sampler handlers that can be created for an agent at any one + * time. Must be at least 16. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B, + /** + * Image pitch alignment. The agent only supports linear image data + * layouts with a row pitch that is a multiple of this value. Must be + * a power of 2. The type of this attribute is size_t.
+ */ + HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C +}; + +/** + * @brief Image handle, populated by ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. Image + * handles are only unique within an agent, not across agents. + * + */ +typedef struct hsa_ext_image_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; + +} hsa_ext_image_t; + +/** + * @brief Geometry associated with the image. This specifies the + * number of image dimensions and whether the image is an image + * array. See the Image Geometry section in the HSA + * Programming Reference Manual for definitions on each + * geometry. The enumeration values match the BRIG type @p + * hsa_ext_brig_image_geometry_t. + */ +typedef enum { +/** + * One-dimensional image addressed by width coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1D = 0, + + /** + * Two-dimensional image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2D = 1, + + /** + * Three-dimensional image addressed by width, height, and depth coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_3D = 2, + + /** + * Array of one-dimensional images with the same size and format. 1D arrays + * are addressed by width and index coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1DA = 3, + + /** + * Array of two-dimensional images with the same size and format. 2D arrays + * are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DA = 4, + + /** + * One-dimensional image addressed by width coordinate. It has + * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An + * image with an opaque image data layout will always use a linear + * image data layout, and one with an explicit image data layout + * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR. + */ + HSA_EXT_IMAGE_GEOMETRY_1DB = 5, + + /** + * Two-dimensional depth image addressed by width and height coordinates. 
+ */ + HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6, + + /** + * Array of two-dimensional depth images with the same size and format. 2D + * arrays are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7 +} hsa_ext_image_geometry_t; + +/** + * @brief Channel type associated with the elements of an image. See + * the Channel Type section in the HSA Programming Reference + * Manual for definitions on each channel type. The + * enumeration values and definition match the BRIG type @p + * hsa_ext_brig_image_channel_type_t. + */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} hsa_ext_image_channel_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants. + */ +typedef uint32_t hsa_ext_image_channel_type32_t; + +/** + * + * @brief Channel order associated with the elements of an image. See + * the Channel Order section in the HSA Programming Reference + * Manual for definitions on each channel order. The + * enumeration values match the BRIG type @p + * hsa_ext_brig_image_channel_order_t. 
+ */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0, + HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} hsa_ext_image_channel_order_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants. + */ +typedef uint32_t hsa_ext_image_channel_order32_t; + + +/** + * @brief Image format. + */ +typedef struct hsa_ext_image_format_s { + /** + * Channel type. + */ + hsa_ext_image_channel_type32_t channel_type; + + /** + * Channel order. + */ + hsa_ext_image_channel_order32_t channel_order; +} hsa_ext_image_format_t; + +/** + * @brief Implementation independent image descriptor. + */ +typedef struct hsa_ext_image_descriptor_s { + /** + * Image geometry. + */ + hsa_ext_image_geometry_t geometry; + /** + * Width of the image, in components. + */ + size_t width; + /** + * Height of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D, + * HSA_EXT_IMAGE_GEOMETRY_2DA, HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t height; + /** + * Depth of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0. 
+ */ + size_t depth; + /** + * Number of image layers in the image array. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t array_size; + /** + * Image format. + */ + hsa_ext_image_format_t format; +} hsa_ext_image_descriptor_t; + +/** + * @brief Image capability. + */ +typedef enum { + /** + * Images of this geometry, format, and layout are not supported by + * the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0, + /** + * Read-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1, + /** + * Write-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2, + /** + * Read-write images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4, + /** + * @deprecated Images of this geometry, format, and layout can be accessed from + * read-modify-write atomic operations in the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8, + /** + * Images of this geometry, format, and layout are guaranteed to + * have a consistent data layout regardless of how they are + * accessed by the associated agent. + */ + HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10 +} hsa_ext_image_capability_t; + +/** + * @brief Image data layout. + * + * @details An image data layout denotes such aspects of image data + * layout as tiling and organization of channels in memory. Some image + * data layouts may only apply to specific image geometries, formats, + * and access permissions. Different agents may support different + * image layout identifiers, including vendor specific layouts. 
Note + * that an agent may not support the same image data layout for + * different access permissions to images with the same image + * geometry, size, and format. If multiple agents support the same + * image data layout then it is possible to use separate image handles + * for each agent that references the same image data. + */ + +typedef enum { + /** + * An implementation specific opaque image data layout which can + * vary depending on the agent, geometry, image format, image size, + * and access permissions. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0, + /** + * The image data layout is specified by the following rules in + * ascending byte address order. For a 3D image, 2DA image array, + * or 1DA image array, the image data is stored as a linear sequence + * of adjacent 2D image slices, 2D images, or 1D images + * respectively, spaced according to the slice pitch. Each 2D image + * is stored as a linear sequence of adjacent image rows, spaced + * according to the row pitch. Each 1D or 1DB image is stored as a + * single image row. Each image row is stored as a linear sequence + * of image elements. Each image element is stored as a linear + * sequence of image components specified by the left to right + * channel order definition. Each image component is stored using + * the memory type specified by the channel type. + * + * The 1DB image geometry always uses the linear image data layout. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1 +} hsa_ext_image_data_layout_t; + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, and image format for an image created with an opaque image + * data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. 
+ * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, or @p capability_mask is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_get_capability( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, image format, and image layout for an image created with + * an explicit image data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. + * + * @param[in] image_data_layout The image data layout. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_get_capability instead. + * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p capability_mask is NULL. 
+ */ +hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + +/** + * @brief Agent specific image size and alignment requirements, populated by + * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout. + */ +typedef struct hsa_ext_image_data_info_s { + /** + * Image data size, in bytes. + */ + size_t size; + + /** + * Image data alignment, in bytes. Must always be a power of 2. + */ + size_t alignment; + +} hsa_ext_image_data_info_t; + +/** + * @brief Retrieve the image data requirements for a given combination of agent, image + * descriptor, and access permission for an image created with an opaque image + * data layout. + * + * @details The optimal image data size and alignment requirements may + * vary depending on the image attributes specified in @p + * image_descriptor, the @p access_permission, and the @p agent. Also, + * different implementations of the HSA runtime may return different + * requirements for the same input values. + * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors as long + * as ::hsa_ext_image_get_capability reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by @p agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. 
The @p agent must support the image format + * specified in @p image_descriptor for the given @p + * access_permission. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p + * agent does not support the image format specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * access_permission is not a valid access permission value, or @p + * image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Retrieve the image data requirements for a given combination of + * image descriptor, access permission, image data layout, image data row pitch, + * and image data slice pitch for an image created with an explicit image + * data layout. + * + * @details The image data size and alignment requirements may vary + * depending on the image attributes specified in @p image_descriptor, + * the @p access_permission, and the image layout. However, different + * implementations of the HSA runtime will return the same + * requirements for the same input values. 
+ * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors and + * matching image layouts as long as ::hsa_ext_image_get_capability + * reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. Image layouts match if they are the + * same image data layout and use the same image row and slice pitch + * values. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by an agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. + * + * @param[in] image_data_layout The image data layout to use. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_data_get_info instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. + * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. 
If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image + * format specified by @p image_descriptor is not supported for the + * @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image + * dimensions specified by @p image_descriptor are not supported for + * the @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and + * slice pitch specified by @p image_data_row_pitch and @p + * image_data_slice_pitch are invalid or not supported. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Creates an agent specific image handle to an image with an + * opaque image data layout. 
+ * + * @details Images with an opaque image data layout created with + * different access permissions but matching image descriptors and + * same agent can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability for the image format specified in + * the image descriptor. Image descriptors match if they have the same + * values, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. + * + * If necessary, an application can use image operations (import, + * export, copy, clear) to prepare the image for the intended use + * regardless of the access permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by agent. The access permission defines how the agent + * is allowed to access the image using the image handle created and + * must match the corresponding HSAIL image handle type. The agent + * must support the image format specified in @p image_descriptor for + * the given @p access_permission. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent + * does not have the capability to support the image format contained + * in @p image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources (for example, the agent does not + * support the creation of more image handles with the given @p + * access_permission). + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p access_permission is not a valid access permission + * value, or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + +/** + * @brief Creates an agent specific image handle to an image with an explicit + * image data layout. + * + * @details Images with an explicit image data layout created with + * different access permissions but matching image descriptors and + * matching image layout can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability_with_layout for the image format + * specified in the image descriptor and specified image data + * layout. Image descriptors match if they have the same values, with + * the exception that s-form channel orders match the corresponding + * non-s-form channel order and vice versa. Image layouts match if + * they are the same image data layout and use the same image row and + * slice pitch values.
+ * + * If necessary, an application can use image operations (import, export, copy, + * clear) to prepare the image for the intended use regardless of the access + * permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by the agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. The agent must support the image format + * specified in @p image_descriptor for the given @p access_permission + * and @p image_data_layout. + * + * @param[in] image_data_layout The image data layout to use for the + * @p image_data. It is invalid to use + * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create + * instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. 
+ * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does + * not have the capability to support the image format contained in the image + * descriptor using the specified @p access_permission and @p image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission and @p + * image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does + * not support the row and slice pitch specified by @p image_data_row_pitch + * and @p image_data_slice_pitch, or the values are invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources (for example, the agent does not + * support the creation of more image handles with the given @p + * access_permission).
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + +/** + * @brief Destroy an image handle previously created using ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. + * + * @details Destroying the image handle does not free the associated image data, + * or modify its contents. The application should not destroy an image handle while + * there are references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_image_destroy( + hsa_agent_t agent, + hsa_ext_image_t image); + +/** + * @brief Copies a portion of one image (the source) to another image (the + * destination). + * + * @details The source and destination image formats should be the + * same, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. For example, + * it is allowed to copy a source image with a channel order of + * HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a + * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB. 
+ * + * The source and destination images do not have to be of the same geometry and + * appropriate scaling is performed by the HSA runtime. It is possible to copy + * subregions between any combinations of source and destination geometries, provided + * that the dimensions of the subregions are the same. For example, it is + * allowed to copy a rectangular region from a 2D image to a slice of a 3D + * image. + * + * If the source and destination image data overlap, or the combination of + * offset and range references an out-of-bounds element in any of the images, + * the behavior is undefined. + * + * @param[in] agent Agent associated with both the source and destination image handles. + * + * @param[in] src_image Image handle of source image. The agent associated with the source + * image handle must be identical to that of the destination image. + * + * @param[in] src_offset Pointer to the offset within the source image where to + * copy the data from. Must not be NULL. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] dst_offset Pointer to the offset within the destination + * image where to copy the data. Must not be NULL. + * + * @param[in] range Dimensions of the image portion to be copied. The HSA + * runtime computes the size of the image data to be copied using this + * argument. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is + * NULL, @p dst_offset is NULL, or @p range is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_copy( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + +/** + * @brief Image region.
+ */ +typedef struct hsa_ext_image_region_s { + /** + * Offset within an image (in coordinates). + */ + hsa_dim3_t offset; + + /** + * Dimension size of the image range (in coordinates). The x, y, and z dimensions + * correspond to width, height, and depth or index respectively. + */ + hsa_dim3_t range; +} hsa_ext_image_region_t; + +/** + * @brief Import a linearly organized image data from memory directly to an + * image handle. + * + * @details This operation updates the image data referenced by the image handle + * from the source memory. The size of the data imported from memory is + * implicitly derived from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the source memory or destination image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the import operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_memory Source memory. Must not be NULL. + * + * @param[in] src_row_pitch The size in bytes of a single row of the image in the + * source memory. If the value is smaller than the destination image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image layer in an image array in the source memory. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p src_row_pitch * destination image region height, then the value used for + * @p src_row_pitch * destination image region height is used. 
+ * Otherwise, the value is not used. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] image_region Pointer to the image region to be updated. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p + * image_region is NULL. + * + */ +hsa_status_t HSA_API hsa_ext_image_import( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Export the image data to linearly organized memory. + * + * @details The operation updates the destination memory with the image data of + * @p src_image. The size of the data exported to memory is implicitly derived + * from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the destination memory or source image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the export operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_image Image handle of source image. + * + * @param[in] dst_memory Destination memory. Must not be NULL. + * + * @param[in] dst_row_pitch The size in bytes of a single row of the image in the + * destination memory. If the value is smaller than the source image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image in an image array in the destination memory. 
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p dst_row_pitch * source image region height, then the value used for + * @p dst_row_pitch * source image region height is used. + * Otherwise, the value is not used. + * + * @param[in] image_region Pointer to the image region to be exported. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_export( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Clear a region of an image so that every image element has + * the specified value. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle for image to be cleared. + * + * @param[in] data The value to which to set each image element being + * cleared. It is specified as an array of image component values. The + * number of array elements must match the number of access components + * for the image channel order. The type of each array element must + * match the image access type of the image channel type. When the + * value is used to set the value of an image element, the conversion + * method corresponding to the image channel type is used. 
See the + * Channel Order section and Channel Type section in + * the HSA Programming Reference Manual for more + * information. Must not be NULL. + * + * @param[in] image_region Pointer to the image region to clear. Must not be + * NULL. If the region references an out-of-bounds element, the behavior is + * undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_clear( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Sampler handle. Samplers are populated by + * ::hsa_ext_sampler_create or ::hsa_ext_sampler_create_v2. Sampler handles are only unique + * within an agent, not across agents. + */ +typedef struct hsa_ext_sampler_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; +} hsa_ext_sampler_t; + +/** + * @brief Sampler address modes. The sampler address mode describes + * the processing of out-of-range image coordinates. See the + * Addressing Mode section in the HSA Programming Reference + * Manual for definitions on each address mode. The values + * match the BRIG type @p hsa_ext_brig_sampler_addressing_t. + */ +typedef enum { + /** + * Out-of-range coordinates are not handled. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0, + + /** + * Clamp out-of-range coordinates to the image edge. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1, + + /** + * Clamp out-of-range coordinates to the image border color. 
+ */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2, + + /** + * Wrap out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3, + + /** + * Mirror out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles with every other + * tile a reflection. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4 + +} hsa_ext_sampler_addressing_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_addressing_mode32_t; + +/** + * @brief Sampler coordinate normalization modes. See the + * Coordinate Normalization Mode section in the HSA + * Programming Reference Manual for definitions on each + * coordinate normalization mode. The values match the BRIG type @p + * hsa_ext_brig_sampler_coord_normalization_t. + */ +typedef enum { + + /** + * Coordinates are used to directly address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0, + + /** + * Coordinates are scaled by the image dimension size before being + * used to address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1 + +} hsa_ext_sampler_coordinate_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_coordinate_mode32_t; + + +/** + * @brief Sampler filter modes. See the Filter Mode section + * in the HSA Programming Reference Manual for definitions + * on each filter mode. The enumeration values match the BRIG type @p + * hsa_ext_brig_sampler_filter_t. + */ +typedef enum { + /** + * Filter to the image element nearest (in Manhattan distance) to the + * specified coordinate. 
+ */ + HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + + /** + * Filter to the image element calculated by combining the elements in a 2x2 + * square block or 2x2x2 cube block around the specified coordinate. The + * elements are combined using linear interpolation. + */ + HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1 + +} hsa_ext_sampler_filter_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_filter_mode32_t; + +/** + * @brief Implementation independent sampler descriptor. + */ +typedef struct hsa_ext_sampler_descriptor_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_mode; +} hsa_ext_sampler_descriptor_t; + +/** + * @brief Implementation independent sampler descriptor v2 which supports + * different address modes in X, Y and Z axes. + */ +typedef struct hsa_ext_sampler_descriptor_v2_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_modes[3]; // in X, Y and Z axes +} hsa_ext_sampler_descriptor_v2_t; + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created. 
+ * + * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor v2 and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created. + * + * @param[in] sampler_descriptor Pointer to a v2 sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create_v2( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create or + * ::hsa_ext_sampler_create_v2. + * + * @details The sampler handle should not be destroyed while there are + * references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the sampler handle. + * + * @param[in] sampler Sampler handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_sampler_destroy( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + +#define hsa_ext_images_1_00 + +/** + * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. 
+ */ +typedef struct hsa_ext_images_1_00_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + +} hsa_ext_images_1_00_pfn_t; + +#define hsa_ext_images_1 + +/** + * @brief The function pointer table for the images v1 extension. 
Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ext_images_1_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + hsa_status_t (*hsa_ext_image_get_capability_with_layout)( + hsa_agent_t agent, + hsa_ext_image_geometry_t 
geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_sampler_create_v2)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +} hsa_ext_images_1_pfn_t; +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif diff --git a/hsa/hsa_ven_amd_aqlprofile.h b/hsa/hsa_ven_amd_aqlprofile.h new file mode 100644 index 0000000000..a49221c49e --- /dev/null +++ b/hsa/hsa_ven_amd_aqlprofile.h @@ -0,0 +1,488 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ +#define OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ + +#include +#include "hsa.h" + +#define HSA_AQLPROFILE_VERSION_MAJOR 2 +#define HSA_AQLPROFILE_VERSION_MINOR 0 + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//////////////////////////////////////////////////////////////////////////////// +// Library version +uint32_t hsa_ven_amd_aqlprofile_version_major(); +uint32_t hsa_ven_amd_aqlprofile_version_minor(); + +/////////////////////////////////////////////////////////////////////// +// Library API: +// The library provides helper methods for instantiation of +// the profile context object and for populating of the start +// and stop AQL packets. The profile object contains a profiling +// events list and needed for profiling buffers descriptors, +// a command buffer and an output data buffer. To check if there +// was an error the library methods return a status code. Also +// the library provides methods for querying required buffers +// attributes, to validate the event attributes and to get profiling +// output data. +// +// Returned status: +// hsa_status_t – HSA status codes are used from hsa.h header +// +// Supported profiling features: +// +// Supported profiling events +typedef enum { + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0, + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1, +} hsa_ven_amd_aqlprofile_event_type_t; + +// Supported performance counters (PMC) blocks +// The block ID is the same for a block instances set, for example +// each block instance from the TCC block set, TCC0, TCC1, …, TCCN +// will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCKS_TCC. 
+typedef enum { + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14, + // Memory related blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24, + // System blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25, + // GFX10 added blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31, + + // UMC & MMEA System Blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33, + + HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER +} hsa_ven_amd_aqlprofile_block_name_t; + +// PMC event object structure +// ‘counter_id’ value is specified in GFXIPs perfcounter user guides +// which is the counters select value, “Performance Counters Selection” +// chapter. 
+typedef struct { + hsa_ven_amd_aqlprofile_block_name_t block_name; + uint32_t block_index; + uint32_t counter_id; +} hsa_ven_amd_aqlprofile_event_t; + +// Check if event is valid for the specific GPU +hsa_status_t hsa_ven_amd_aqlprofile_validate_event( + hsa_agent_t agent, // HSA handle for the profiling GPU + const hsa_ven_amd_aqlprofile_event_t* event, // [in] Pointer on validated event + bool* result); // [out] True if the event valid, False otherwise + +// Profiling parameters +// All parameters are generic and if not applicable for a specific +// profile configuration then error status will be returned. +typedef enum { + /** + * Select the target compute unit (wgp) for profiling. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0, + /** + * VMID Mask + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4, + /** + * Shader engine mask for selection. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7, + /** + * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8, + /** + * Set true for occupancy collection only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9, + /** + * ATT collection max data size, in MB. Shared among shader engines. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10, + /** + * Mask of which compute units to generate perfcounters. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240, + /** + * Select collection period for perfcounters. GFX9 only. 
+ */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241, + /** + * Select perfcounter ID (SQ block) for collection. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242, +} hsa_ven_amd_aqlprofile_parameter_name_t; + +// Profile parameter object +typedef struct { + hsa_ven_amd_aqlprofile_parameter_name_t parameter_name; + uint32_t value; +} hsa_ven_amd_aqlprofile_parameter_t; + +typedef enum { + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_0 = 0, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_1, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_3 +} hsa_ven_amd_aqlprofile_att_marker_channel_t; + +// +// Profile context object: +// The library provides a profile object structure which contains +// the events array, a buffer for the profiling start/stop commands +// and a buffer for the output data. +// The buffers are specified by the buffer descriptors and allocated +// by the application. The buffers allocation attributes, the command +// buffer size, the PMC output buffer size as well as profiling output +// data can be obtained using the generic get profile info helper _get_info. 
+// +// Buffer descriptor +typedef struct { + void* ptr; + uint32_t size; +} hsa_ven_amd_aqlprofile_descriptor_t; + +// Profile context object structure, contains profiling events list and +// needed for profiling buffers descriptors, a command buffer and +// an output data buffer +typedef struct { + hsa_agent_t agent; // GFXIP handle + hsa_ven_amd_aqlprofile_event_type_t type; // Events type + const hsa_ven_amd_aqlprofile_event_t* events; // Events array + uint32_t event_count; // Events count + const hsa_ven_amd_aqlprofile_parameter_t* parameters; // Parameters array + uint32_t parameter_count; // Parameters count + hsa_ven_amd_aqlprofile_descriptor_t output_buffer; // Output buffer + hsa_ven_amd_aqlprofile_descriptor_t command_buffer; // PM4 commands +} hsa_ven_amd_aqlprofile_profile_t; + +// +// AQL packets populating methods: +// The helper methods to populate provided by the application START and +// STOP AQL packets which the application is required to submit before and +// after profiled GPU task packets respectively. 
+// +// AQL Vendor Specific packet which carries a PM4 command +typedef struct { + uint16_t header; + uint16_t pm4_command[27]; + hsa_signal_t completion_signal; +} hsa_ext_amd_aql_pm4_packet_t; + +// Method to populate the provided AQL packet with profiling start commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_start( + hsa_ven_amd_aqlprofile_profile_t* profile, // [in,out] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet + +// Method to populate the provided AQL packet with profiling stop commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_stop( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet + +// Method to populate the provided AQL packet with profiling read commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_read( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); // [out] profile read AQL packet + +// Legacy devices, PM4 profiling packet size (static: one private copy per translation unit, no duplicate symbols in C) +static const unsigned HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192; +// Legacy devices, converting the profiling AQL packet to PM4 packet blob +hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, // [in] AQL packet + void* data); // [out] PM4 packet blob + +// Method to add a marker (correlation ID) into the ATT buffer. 
+hsa_status_t hsa_ven_amd_aqlprofile_att_marker( + hsa_ven_amd_aqlprofile_profile_t* profile, // [in,out] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_marker_packet, // [out] profile marker AQL packet + uint32_t data, // [in] Data to be inserted + hsa_ven_amd_aqlprofile_att_marker_channel_t channel); // [in] Comm channel + +// +// Get profile info: +// Generic method for getting various profile info including profile buffers +// attributes like the command buffer size and the profiling PMC results. +// It’s implied that all counters are 64bit values. +// +// Profile generic output data: +typedef struct { + uint32_t sample_id; // PMC sample or trace buffer index + union { + struct { + hsa_ven_amd_aqlprofile_event_t event; // PMC event + uint64_t result; // PMC result + } pmc_data; + hsa_ven_amd_aqlprofile_descriptor_t trace_data; // Trace output data descriptor + }; +} hsa_ven_amd_aqlprofile_info_data_t; + +// ID query type +typedef struct { + const char* name; + uint32_t id; + uint32_t instance_count; +} hsa_ven_amd_aqlprofile_id_query_t; + +// Profile attributes +typedef enum { + HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value + HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value + HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value + // in info_data object + HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3, // get_info returns trace buffer ptr/size + // in info_data object + HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counter + HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances + // by name string using _id_query_t + HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for + // counters enable command buffer + HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for + // counters disable command buffer +} hsa_ven_amd_aqlprofile_info_type_t; 
+ + +// Definition of output data iterator callback +typedef hsa_status_t (*hsa_ven_amd_aqlprofile_data_callback_t)( + hsa_ven_amd_aqlprofile_info_type_t info_type, // [in] data type, PMC or trace data + hsa_ven_amd_aqlprofile_info_data_t* info_data, // [in] info_data object + void* callback_data); // [in,out] data passed to the callback + +// Method for getting the profile info +hsa_status_t hsa_ven_amd_aqlprofile_get_info( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ven_amd_aqlprofile_info_type_t attribute, // [in] requested profile attribute + void* value); // [in,out] returned value + +// Method for iterating the events output data +hsa_status_t hsa_ven_amd_aqlprofile_iterate_data( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate the output data + void* data); // [in,out] data passed to the callback + +// Return error string +hsa_status_t hsa_ven_amd_aqlprofile_error_string( + const char** str); // [out] pointer to the error string + +/** + * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. + */ +typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name); +/** + * @brief Iterate over all possible event coordinate IDs and their names. + */ +hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t); + +/** + * @brief Callback receiving each event coordinate for a given agent_t and event_t. + * @param position A counting sequence indicating callback number. + * @param id Coordinate ID as in _iterate_event_ids. + * @param extent Coordinate extent indicating maximum allowed instances. + * @param coordinate The coordinate, in the range [0,extent-1]. + * @param name Coordinate name as in _iterate_event_ids. + * @param userdata Userdata pointer as passed to the _iterate_event_coord function.
+ */ +typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( + int position, + int id, + int extent, + int coordinate, + const char* name, + void* userdata +); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param[in] agent HSA agent. + * @param[in] event The event ID and block ID to iterate for. + * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. + * @param[in] callback Callback function to return the coordinates. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + */ +hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord( + hsa_agent_t agent, + hsa_ven_amd_aqlprofile_event_t event, + uint32_t sample_id, + hsa_ven_amd_aqlprofile_coordinate_callback_t callback, + void* userdata +); + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_aqlprofile_VERSION_MAJOR 1 +#define hsa_ven_amd_aqlprofile_LIB(suff) "libhsa-amd-aqlprofile" suff ".so" + +#ifdef HSA_LARGE_MODEL +static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("64"); +#else +static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB(""); +#endif + +/** + * @brief Extension function table. 
+ */ +typedef struct hsa_ven_amd_aqlprofile_1_00_pfn_s { + uint32_t (*hsa_ven_amd_aqlprofile_version_major)(); + uint32_t (*hsa_ven_amd_aqlprofile_version_minor)(); + + hsa_status_t (*hsa_ven_amd_aqlprofile_error_string)( + const char** str); + + hsa_status_t (*hsa_ven_amd_aqlprofile_validate_event)( + hsa_agent_t agent, + const hsa_ven_amd_aqlprofile_event_t* event, + bool* result); + + hsa_status_t (*hsa_ven_amd_aqlprofile_start)( + hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_stop)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_read)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_legacy_get_pm4)( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, + void* data); + + hsa_status_t (*hsa_ven_amd_aqlprofile_get_info)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ven_amd_aqlprofile_info_type_t attribute, + void* value); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_data)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ven_amd_aqlprofile_data_callback_t callback, + void* data); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_ids)( + hsa_ven_amd_aqlprofile_eventname_callback_t + ); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_coord)( + hsa_agent_t agent, + hsa_ven_amd_aqlprofile_event_t event, + uint32_t sample_id, + hsa_ven_amd_aqlprofile_coordinate_callback_t callback, + void* userdata + ); + + hsa_status_t (*hsa_ven_amd_aqlprofile_att_marker)( + hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_packet, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel + ); +} hsa_ven_amd_aqlprofile_1_00_pfn_t; + +typedef hsa_ven_amd_aqlprofile_1_00_pfn_t hsa_ven_amd_aqlprofile_pfn_t; + +#ifdef __cplusplus +} 
+#endif // __cplusplus + +#endif // OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ diff --git a/hsa/hsa_ven_amd_loader.h b/hsa/hsa_ven_amd_loader.h new file mode 100644 index 0000000000..47236c86e9 --- /dev/null +++ b/hsa/hsa_ven_amd_loader.h @@ -0,0 +1,667 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension for additional loader functionality. + +#ifndef HSA_VEN_AMD_LOADER_H +#define HSA_VEN_AMD_LOADER_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @brief Queries equivalent host address for given @p device_address, and + * records it in @p host_address. + * + * + * @details Contents of memory pointed to by @p host_address would be identical + * to contents of memory pointed to by @p device_address. Only difference + * between the two is host accessibility: @p host_address is always accessible + * from host, @p device_address might not be accessible from host. + * + * If @p device_address already points to host accessible memory, then the value + * of @p device_address is simply copied into @p host_address. + * + * The lifetime of @p host_address is the same as the lifetime of @p + * device_address, and both lifetimes are limited by the lifetime of the + * executable that is managing these addresses. + * + * + * @param[in] device_address Device address to query equivalent host address + * for. + * + * @param[out] host_address Pointer to application-allocated buffer to record + * queried equivalent host address in. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or + * null, or @p host_address is null. 
+ */ +hsa_status_t hsa_ven_amd_loader_query_host_address( + const void *device_address, + const void **host_address); + +/** + * @brief The storage type of the code object that is backing loaded memory + * segment. + */ +typedef enum { + /** + * Loaded memory segment is not backed by any code object (anonymous), as the + * case would be with BSS (uninitialized data). + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0, + /** + * Loaded memory segment is backed by the code object that is stored in the + * file. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1, + /** + * Loaded memory segment is backed by the code object that is stored in the + * memory. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2 +} hsa_ven_amd_loader_code_object_storage_type_t; + +/** + * @brief Loaded memory segment descriptor. + * + * + * @details Loaded memory segment descriptor describes underlying loaded memory + * segment. Loaded memory segment is created/allocated by the executable during + * the loading of the code object that is backing underlying memory segment. + * + * The lifetime of underlying memory segment is limited by the lifetime of the + * executable that is managing underlying memory segment. + */ +typedef struct hsa_ven_amd_loader_segment_descriptor_s { + /** + * Agent underlying memory segment is allocated on. If the code object that is + * backing underlying memory segment is program code object, then 0. + */ + hsa_agent_t agent; + /** + * Executable that is managing this underlying memory segment. + */ + hsa_executable_t executable; + /** + * Storage type of the code object that is backing underlying memory segment. 
+ */ + hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated + * filepath to the code object; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host + * accessible pointer to the first byte of the code object. + */ + const void *code_object_storage_base; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of + * the filepath to the code object (including null-terminating character); + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in + * bytes, of the memory occupied by the code object. + */ + size_t code_object_storage_size; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - other, then offset, in bytes, from the beginning of the code object to + * the first byte in the code object data is copied from. + */ + size_t code_object_storage_offset; + /** + * Starting address of the underlying memory segment. + */ + const void *segment_base; + /** + * Size, in bytes, of the underlying memory segment. + */ + size_t segment_size; +} hsa_ven_amd_loader_segment_descriptor_t; + +/** + * @brief Either queries loaded memory segment descriptors, or total number of + * loaded memory segment descriptors. + * + * + * @details If @p segment_descriptors is not null and @p num_segment_descriptors + * points to number that exactly matches total number of loaded memory segment + * descriptors, then queries loaded memory segment descriptors, and records them + * in @p segment_descriptors. 
If @p segment_descriptors is null and @p + * num_segment_descriptors points to zero, then queries total number of loaded + * memory segment descriptors, and records it in @p num_segment_descriptors. In + * all other cases returns appropriate error code (see below). + * + * The caller of this function is responsible for the allocation/deallocation + * and the lifetime of @p segment_descriptors and @p num_segment_descriptors. + * + * The lifetime of loaded memory segments that are described by queried loaded + * memory segment descriptors is limited by the lifetime of the executable that + * is managing loaded memory segments. + * + * Queried loaded memory segment descriptors are always self-consistent: they + * describe a complete set of loaded memory segments that are being backed by + * fully loaded code objects that are present at the time (i.e. this function + * is blocked until all executable manipulations are fully complete). + * + * + * @param[out] segment_descriptors Pointer to application-allocated buffer to + * record queried loaded memory segment descriptors in. Can be null if @p + * num_segment_descriptors points to zero. + * + * @param[in,out] num_segment_descriptors Pointer to application-allocated + * buffer that contains either total number of loaded memory segment descriptors + * or zero. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null + * while @p num_segment_descriptors points to non-zero number, @p + * segment_descriptors is not null while @p num_segment_descriptors points to + * zero, or @p num_segment_descriptors is null. + * + * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors + * does not point to number that exactly matches total number of loaded memory + * segment descriptors. 
+ */ +hsa_status_t hsa_ven_amd_loader_query_segment_descriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + +/** + * @brief Obtains the handle of executable to which the device address belongs. + * + * @details This method should not be used to obtain executable handle by using + * a host address. The executable returned is expected to be alive until it is + * destroyed by the user. + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there + * is no executable found for this kernel code object. + */ +hsa_status_t hsa_ven_amd_loader_query_executable( + const void *device_address, + hsa_executable_t *executable); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the loaded code objects in an executable, and invoke + * an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per loaded code object. The + * HSA runtime passes three arguments to the callback: the executable, a + * loaded code object, and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and + * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that + * status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */ +hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + +/** + * @brief Loaded code object kind. + */ +typedef enum { + /** + * Program code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1, + /** + * Agent code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2 +} hsa_ven_amd_loader_loaded_code_object_kind_t; + +/** + * @brief Loaded code object attributes. + */ +typedef enum hsa_ven_amd_loader_loaded_code_object_info_e { + /** + * The executable in which this loaded code object is loaded. The + * type of this attribute is ::hsa_executable_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1, + /** + * The kind of this loaded code object. The type of this attribute is + * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2, + /** + * The agent on which this loaded code object is loaded. The + * value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this + * attribute is ::hsa_agent_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3, + /** + * The storage type of the code object reader used to load the loaded code object. + * The type of this attribute is ::uint32_t interpreted as a + * ::hsa_ven_amd_loader_code_object_storage_type_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4, + /** + * The memory address of the first byte of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t.
+ */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5, + /** + * The memory size in bytes of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6, + /** + * The file descriptor of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this + * attribute is ::int. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7, + /** + * The signed byte address difference of the memory address at which the code + * object is loaded minus the virtual address specified in the code object + * that is loaded. The value of this attribute is only defined if the + * executable in which the code object is loaded is frozen. The type of this + * attribute is ::int64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8, + /** + * The base memory address at which the code object is loaded. This is the + * base address of the allocation for the lowest addressed segment of the code + * object that is loaded. Note that any non-loaded segments before the first + * loaded segment are ignored. The value of this attribute is only defined if + * the executable in which the code object is loaded is frozen. The type of + * this attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9, + /** + * The byte size of the loaded code object's contiguous memory allocation. The + * value of this attribute is only defined if the executable in which the code + * object is loaded is frozen. The type of this attribute is ::uint64_t.
+ */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10, + /** + * The length of the URI in bytes, not including the NUL terminator. The type + * of this attribute is uint32_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11, + /** + * The URI name from which the code object was loaded. The type of this + * attribute is a NUL terminated \p char* with the length equal to the value + * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute. + * The URI name syntax is defined by the following BNF syntax: + * + * code_object_uri ::== file_uri | memory_uri + * file_uri ::== "file://" file_path [ range_specifier ] + * memory_uri ::== "memory://" process_id range_specifier + * range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number + * file_path ::== URI_ENCODED_OS_FILE_PATH + * process_id ::== DECIMAL_NUMBER + * number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER + * + * ``number`` is a C integral literal where hexadecimal values are prefixed by + * "0x" or "0X", and octal values by "0". + * + * ``file_path`` is the file's path specified as a URI encoded UTF-8 string. + * In URI encoding, every character that is not in the regular expression + * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits + * preceded by "%". Directories in the path are separated by "/". + * + * ``offset`` is a 0-based byte offset to the start of the code object. For a + * file URI, it is from the start of the file specified by the ``file_path``, + * and if omitted defaults to 0. For a memory URI, it is the memory address + * and is required. + * + * ``size`` is the number of bytes in the code object. For a file URI, if + * omitted it defaults to the size of the file. It is required for a memory + * URI. + * + * ``process_id`` is the identity of the process owning the memory. For Linux + * it is the C unsigned integral decimal literal for the process ID (PID).
+ * + * For example: + * + * file:///dir1/dir2/file1 + * file:///dir3/dir4/file2#offset=0x2000&size=3000 + * memory://1234#offset=0x20000&size=3000 + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12, +} hsa_ven_amd_loader_loaded_code_object_info_t; + +/** + * @brief Get the current value of an attribute for a given loaded code + * object. + * + * @param[in] loaded_code_object Loaded code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * loaded code object attribute, or @p value is NULL. + */ +hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + +//===----------------------------------------------------------------------===// + +/** + * @brief Create a code object reader to operate on a file with size and offset. + * + * @param[in] file File descriptor. The file must have been opened by + * application with at least read permissions prior to calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[in] size Size of the code object embedded in @p file.
+ * + * @param[in] offset 0-based offset relative to the beginning of the @p file + * that denotes the beginning of the code object embedded within the @p file. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least + * read permissions. This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset + * do not form a valid code object, the file size is 0, or offset > file size. + * This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t +hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the available executables, and invoke an + * application-defined callback on every iteration. While + * ::hsa_ven_amd_loader_iterate_executables is executing any calls to + * ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy will be blocked. + * + * @param[in] callback Callback to be invoked once per executable. The HSA + * runtime passes two arguments to the callback: the executable and the + * application data.
If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_ven_amd_loader_iterate_executables returns that status value. If + * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy then the behavior is undefined. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +//===----------------------------------------------------------------------===// + +/** + * @brief Extension version. NOTE(review): the leading zeros make 001003 an octal constant (decimal 515); confirm this is intended. + */ +#define hsa_ven_amd_loader 001003 + +/** + * @brief Extension function table version 1.00: host address, segment descriptor and executable queries. + */ +typedef struct hsa_ven_amd_loader_1_00_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); +} hsa_ven_amd_loader_1_00_pfn_t; + +/** + * @brief Extension function table version 1.01: the 1.00 entries plus loaded code object iteration and info queries.
+ */ +typedef struct hsa_ven_amd_loader_1_01_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); +} hsa_ven_amd_loader_1_01_pfn_t; + +/** + * @brief Extension function table version 1.02: the 1.01 entries plus the file offset/size code object reader. + */ +typedef struct hsa_ven_amd_loader_1_02_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t
offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); +} hsa_ven_amd_loader_1_02_pfn_t; + +/** + * @brief Extension function table version 1.03: the 1.02 entries plus executable iteration. + */ +typedef struct hsa_ven_amd_loader_1_03_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + + hsa_status_t + (*hsa_ven_amd_loader_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); +} hsa_ven_amd_loader_1_03_pfn_t; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* HSA_VEN_AMD_LOADER_H */ diff --git a/hsa/hsa_ven_amd_pc_sampling.h b/hsa/hsa_ven_amd_pc_sampling.h new file mode 100644 index 0000000000..019f0ea5c9 --- /dev/null +++ b/hsa/hsa_ven_amd_pc_sampling.h @@ -0,0 +1,416 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_H +#define HSA_VEN_AMD_PC_SAMPLING_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + + +/** + * @brief HSA AMD Vendor PC Sampling APIs + * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be + * modified extensively in the future + */ + +/** + * @brief PC Sampling sample data for hosttrap sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t reserved0; + uint64_t reserved1; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_hosttrap_v1_t; + +/** + * @brief PC Sampling sample data for stochastic sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_snapshot_v1_t; + +/** + * @brief PC Sampling method kinds + */ +typedef enum { + HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1, + HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 +} hsa_ven_amd_pcs_method_kind_t; + +/** + * @brief PC Sampling interval unit type + */ +typedef enum { + HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS +} hsa_ven_amd_pcs_units_t; + +/** + * @brief HSA callback function to perform the copy onto a destination buffer + * + * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal + * 
+ * buffers. Remaining contents of HSA internal buffers will be included in next + * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling + * hsa_ven_amd_pcs_flush. + * + * @param[in] hsa_callback_data private data to pass back to HSA. Provided in + * hsa_ven_amd_pcs_data_ready_callback_t + * + * @param[in] data_size size of destination buffer in bytes. + * @param[in] destination destination buffer + * @retval TBD: but could be used to indicate that there is no more data to be read. + * Or indicate an error and abort of current copy operations + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data, + size_t data_size, void* destination); + +/** + * @brief HSA callback function to indicate that there is data ready to be copied + * + * When the client receives this callback, the client should call back @p data_copy_callback for HSA + * to perform the copy operation into an available buffer. @p data_copy_callback can be called back + * multiple times with smaller @p data_size to split the copy operation. + * + * This callback must not call ::hsa_ven_amd_pcs_flush. + * + * @param[in] client_callback_data client private data passed in via + * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id + * @param[in] data_size size of data available to be copied + * @param[in] lost_sample_count number of lost samples since last call to + * hsa_ven_amd_pcs_data_ready_callback_t. + * @param[in] data_copy_callback callback function for HSA to perform the actual copy + * @param[in] hsa_callback_data private data to pass back to HSA + */ +typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)( + void* client_callback_data, size_t data_size, size_t lost_sample_count, + hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data); + +/** + * @brief Opaque handle representing a sampling session. 
+ * Two sessions having same handle value represent the same session + */ +typedef struct { + uint64_t handle; +} hsa_ven_amd_pcs_t; + +/** + * @brief PC Sampling configuration flag options + */ +typedef enum { + /* The interval for this sampling method have to be a power of 2 */ + HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0) +} hsa_ven_amd_pcs_configuration_flags_t; + +/** + * @brief PC Sampling method information + * Used to provide client with list of supported PC Sampling methods + */ +typedef struct { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t min_interval; + size_t max_interval; + uint64_t flags; +} hsa_ven_amd_pcs_configuration_t; + +/** + * @brief Callback function to iterate through list of supported PC Sampling configurations + * + * @param[in] configuration one entry for supported PC Sampling method and configuration options + * @param[in] callback_data client private callback data that was passed in when calling + * hsa_ven_amd_pcs_iterate_configuration + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)( + const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data); + +/** + * @brief Iterate through list of current supported PC Sampling configurations for this @p agent + * + * HSA will callback @p configuration_callback for each currently available PC Sampling + * configuration. The list of currently available configurations may not be the complete list of + * configurations supported on the @p agent. The list of currently available configurations may be + * reduced if the @p agent is currently handling other PC sampling sessions. 
+ * + * @param[in] agent target agent + * @param[in] configuration_callback callback function to iterate through list of configurations + * @param[in] callback_data client private callback data + **/ +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +/** + * @brief Create a PC Sampling session on @p agent + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, as described by the + * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of + * hsa_ven_amd_pcs_iterate_configuration for this @p agent. + * A successful call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. 
@p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + + +/** + * @brief Creates a PC Sampling session on @p agent. Assumes that the caller provides the + * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling + * on the @p agent. + * + * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing + * PC sampling session that was previously created in the underlying driver. + * + * Allocate the resources required for a PC Sampling session. 
The @p method, @p units, @p interval + * parameters must be a legal configuration value, and match the parameters that we used to create + * the underlying PC Sampling session in the underlying driver. + * A successful call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. 
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +/** + * @brief Free a PC Sampling session on @p agent + * + * Free all the resources allocated for a PC Sampling session on @p agent + * Internal buffers for this session will be lost. + * If the session was active, the session will be stopped before it is destroyed. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Start a PC Sampling session + * + * Activate a PC Sampling session that was previous created. 
+ * The session will be in an active state after this call + * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session started successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Stop a PC Sampling session + * + * Stop a session that is currently active + * After a session is stopped HSA may still have some PC Sampling data in its internal buffers. + * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal + * buffers are not drained and the session is started again, the internal buffers will be available + * on the next data_ready_callback. + * If the session was already inactive, this will result in a no-op and will return + * HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session stopped successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Flush internal buffers for a PC Sampling session + * + * Drain internal buffers for a PC Sampling session. If internal buffers have available data, + * this triggers a data_ready_callback. + * + * The function blocks until all PC samples associated with the @p pc_sampling session + * generated prior to the function call have been communicated by invocations of + * @p data_ready_callback having completed execution. 
+ * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session flushed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +#define hsa_ven_amd_pc_sampling_1_00 + +/** + * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by + * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t { + hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_create_from_id)( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling); + +} hsa_ven_amd_pc_sampling_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif /* HSA_VEN_AMD_PC_SAMPLING_H */ From 28c81cffda3f87e8faec4eaa3e60a8ab9d93350a Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Fri, 16 May 2025 16:25:48 +0800 Subject: [PATCH 25/32] librocdxg: include 
rocr headers Signed-off-by: Flora Cui --- wddm/cmd_util.h | 6 +++--- wddm/queue.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/wddm/cmd_util.h b/wddm/cmd_util.h index 5be6c4764f..491c56e445 100644 --- a/wddm/cmd_util.h +++ b/wddm/cmd_util.h @@ -4,9 +4,9 @@ #define _WSL_INC_WDDM_CMD_UTIL_H_ #include -#include "hsa-runtime/inc/hsa.h" -#include "hsa-runtime/inc/amd_hsa_queue.h" -#include "hsa-runtime/inc/amd_hsa_kernel_code.h" +#include "impl/hsa/hsa.h" +#include "impl/hsa/amd_hsa_queue.h" +#include "impl/hsa/amd_hsa_kernel_code.h" #include "impl/pm4_cmds.h" #include "util/utils.h" diff --git a/wddm/queue.h b/wddm/queue.h index 8dfd1c9e4a..928ce98893 100644 --- a/wddm/queue.h +++ b/wddm/queue.h @@ -50,9 +50,9 @@ #include "impl/wddm/types.h" #include "impl/wddm/device.h" #include "impl/wddm/gpu_memory.h" -#include "hsa-runtime/inc/hsa_ext_amd.h" -#include "hsa-runtime/inc/amd_hsa_queue.h" -#include "hsa-runtime/inc/amd_hsa_signal.h" +#include "impl/hsa/hsa_ext_amd.h" +#include "impl/hsa/amd_hsa_queue.h" +#include "impl/hsa/amd_hsa_signal.h" #include "impl/wddm/cmd_util.h" namespace wsl { From bf818a2e75e68cf4198fa5439157187fe42f0335 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Mon, 23 Jun 2025 16:45:58 +0800 Subject: [PATCH 26/32] librocdxg: update rocr queue type to amd_queue_v2_t Signed-off-by: Flora Cui --- wddm/cmd_util.h | 2 +- wddm/queue.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/wddm/cmd_util.h b/wddm/cmd_util.h index 491c56e445..5826c34afe 100644 --- a/wddm/cmd_util.h +++ b/wddm/cmd_util.h @@ -19,7 +19,7 @@ struct DispatchInfo { void *pEntry; const amd_kernel_code_t *pKernelObject; uint32_t ldsBlks; - amd_queue_t *pAmdQueue; + amd_queue_v2_t *pAmdQueue; bool wave32; uint32_t srd; void *pScratchBase; diff --git a/wddm/queue.h b/wddm/queue.h index 928ce98893..59da825ec2 100644 --- a/wddm/queue.h +++ b/wddm/queue.h @@ -237,8 +237,8 @@ private: uint64_t GetKernelObjAddr(uint64_t addr) const; void 
InitScratchSRD(); GpuMemoryHandle amd_queue_mem_; - amd_queue_t *amd_queue_; - amd_queue_t *amd_queue_rocr_; + amd_queue_v2_t *amd_queue_; + amd_queue_v2_t *amd_queue_rocr_; uint64_t doorbell_signal_value_; volatile std::atomic *error_code_; std::thread aql_to_pm4_thread_; From b5dd613ccd19026b43d533cf462b768c81bcdb68 Mon Sep 17 00:00:00 2001 From: Chengjun Yao Date: Wed, 10 Sep 2025 10:58:06 +0800 Subject: [PATCH 27/32] librocdxg: Integrate DXCore loader into WDDM thunks Replace direct D3DKMT API calls with DXCORE_CALL macro in WDDM thunk layer. This enables dynamic loading of DXCore functions while maintaining the same API interface. Updated thunk functions: - MapGpuVirtualAddress, CreateAllocation, DestroyAllocation - ReserveGpuVirtualAddress, FreeGpuVirtualAddress - MakeResident, Evict, ShareObjects - QueryResourceInfoFromNtHandle, OpenResourceFromNtHandle All existing functionality is preserved while adding flexibility for runtime DXCore availability detection. Signed-off-by: Chengjun Yao Signed-off-by: Yang Su Reviewed-by: Shi.Leslie --- wddm/thunks.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/wddm/thunks.h b/wddm/thunks.h index 9783eb0177..68f0015d6d 100644 --- a/wddm/thunks.h +++ b/wddm/thunks.h @@ -45,6 +45,7 @@ #include "impl/wddm/status.h" #include "impl/wddm/types.h" +#include "dxcore_loader.h" namespace wsl { namespace thunk { @@ -113,11 +114,11 @@ typedef D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE SubmitWaitForSyncObjectsToH typedef D3DKMT_CREATESYNCFILE CreateSyncFileArgs; inline ErrorCode MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) { - return TranslateNtStatus(D3DKMTMapGpuVirtualAddress(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTMapGpuVirtualAddress(args))); } inline ErrorCode CreateAllocation(CreateAllocationArgs *args) { - return TranslateNtStatus(D3DKMTCreateAllocation2(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTCreateAllocation2(args))); } inline ErrorCode 
DestroyAllocation( @@ -137,11 +138,11 @@ inline ErrorCode DestroyAllocation( args.AllocationCount = num_allocations; } - return TranslateNtStatus(D3DKMTDestroyAllocation2(&args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTDestroyAllocation2(&args))); } inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) { - return TranslateNtStatus(D3DKMTReserveGpuVirtualAddress(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTReserveGpuVirtualAddress(args))); } inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, @@ -177,7 +178,7 @@ inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, } inline ErrorCode FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) { - return TranslateNtStatus(D3DKMTFreeGpuVirtualAddress(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTFreeGpuVirtualAddress(args))); } inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, @@ -191,11 +192,11 @@ inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, } inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) { - return TranslateNtStatus(D3DKMTMakeResident(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTMakeResident(args))); } inline ErrorCode Evict(EvictArgs *args) { - return TranslateNtStatus(D3DKMTEvict(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTEvict(args))); } inline ErrorCode ShareObjects(size_t num_allocations, @@ -207,8 +208,8 @@ inline ErrorCode ShareObjects(size_t num_allocations, ErrorCode ret; InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr); - ret = TranslateNtStatus(D3DKMTShareObjects(num_allocations, - &resource, &obj_attr, flags, &nt_handle)); + ret = TranslateNtStatus(DXCORE_CALL(D3DKMTShareObjects(num_allocations, + &resource, &obj_attr, flags, &nt_handle))); if (ret == ErrorCode::Success) *dmabuf_fd = *(reinterpret_cast(&nt_handle)); else @@ -218,11 +219,11 @@ inline ErrorCode ShareObjects(size_t num_allocations, } inline ErrorCode 
QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) { - return TranslateNtStatus(D3DKMTQueryResourceInfoFromNtHandle(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTQueryResourceInfoFromNtHandle(args))); } inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) { - return TranslateNtStatus(D3DKMTOpenResourceFromNtHandle(args)); + return TranslateNtStatus(DXCORE_CALL(D3DKMTOpenResourceFromNtHandle(args))); } } // namespace d3dthunk From 1bc5af684ce2fd70eea3f5895086136ea79924bb Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Thu, 23 Oct 2025 10:09:23 +0800 Subject: [PATCH 28/32] update hsa header Signed-off-by: Flora Cui --- hsa/hsa_amd_tool.h | 6 ++ hsa/hsa_api_trace.h | 2 + hsa/hsa_api_trace_version.h | 2 +- hsa/hsa_ext_amd.h | 123 +++++++++++++++++++++++++++++++++--- 4 files changed, 124 insertions(+), 9 deletions(-) diff --git a/hsa/hsa_amd_tool.h b/hsa/hsa_amd_tool.h index fa9cac804a..22847a8a44 100644 --- a/hsa/hsa_amd_tool.h +++ b/hsa/hsa_amd_tool.h @@ -1,3 +1,9 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+ * + * SPDX-License-Identifier: MIT + */ + #ifndef HSA_RUNTIME_AMD_TOOL_EVENTS_H_ #define HSA_RUNTIME_AMD_TOOL_EVENTS_H_ diff --git a/hsa/hsa_api_trace.h b/hsa/hsa_api_trace.h index 6515b19700..cc33320269 100644 --- a/hsa/hsa_api_trace.h +++ b/hsa/hsa_api_trace.h @@ -271,6 +271,8 @@ struct AmdExtTable { decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn; decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn; decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn; + decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn; + decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn; }; // Table to export HSA Core Runtime Apis diff --git a/hsa/hsa_api_trace_version.h b/hsa/hsa_api_trace_version.h index befd1e26e3..6cf1054823 100644 --- a/hsa/hsa_api_trace_version.h +++ b/hsa/hsa_api_trace_version.h @@ -58,7 +58,7 @@ // Step Ids of the Api tables exported by Hsa Core Runtime #define HSA_API_TABLE_STEP_VERSION 0x01 #define HSA_CORE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x07 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x08 #define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 #define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01 // Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION diff --git a/hsa/hsa_ext_amd.h b/hsa/hsa_ext_amd.h index c402a51264..3fd1f9348e 100644 --- a/hsa/hsa_ext_amd.h +++ b/hsa/hsa_ext_amd.h @@ -62,9 +62,12 @@ * - 1.9 - hsa_amd_portable_export_dmabuf_v2 * - 1.10 - hsa_amd_vmem_address_reserve: HSA_AMD_VMEM_ADDRESS_NO_REGISTER * - 1.11 - hsa_amd_agent_info_t: HSA_AMD_AGENT_INFO_CLOCK_COUNTERS + * - 1.12 - hsa_amd_pointer_info: HSA_EXT_POINTER_TYPE_HSA_VMEM and HSA_EXT_POINTER_TYPE_RESERVED_ADDR + * - 1.13 - hsa_amd_pointer_info: Added new registered field to hsa_amd_pointer_info_t + * - 1.14 - hsa_amd_ais_file_write, hsa_amd_ais_file_read */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 11 +#define 
HSA_AMD_INTERFACE_VERSION_MINOR 14 #ifdef __cplusplus extern "C" { @@ -1578,7 +1581,10 @@ typedef enum hsa_amd_memory_pool_flag_s { * Allocates executable memory */ HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG = (1 << 2), - + /** + * Allocates uncached memory + */ + HSA_AMD_MEMORY_POOL_UNCACHED_FLAG = (1 << 3), } hsa_amd_memory_pool_flag_t; /** @@ -1806,8 +1812,6 @@ hsa_status_t HSA_API * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Agent does not have available SDMA engines. * - * @retval ::HSA_STATUS_ERROR_INVALID_AGENT dst_agent and src_agent are the same as - * dst_agent == src_agent is generally used for shader copies. */ hsa_status_t HSA_API hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent, @@ -1824,8 +1828,6 @@ hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent, * * @retval ::HSA_STATUS_SUCCESS For mask returned * - * @retval ::HSA_STATUS_ERROR_INVALID_AGENT dst_agent and src_agent are the same as - * dst_agent == src_agent is generally used for shader copies. */ hsa_status_t HSA_API hsa_amd_memory_get_preferred_copy_engine(hsa_agent_t dst_agent, hsa_agent_t src_agent, @@ -2362,7 +2364,11 @@ typedef enum { /* No backend memory but virtual address */ - HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5 + HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5, + /* + Memory was allocated with an HSA virtual memory allocator + */ + HSA_EXT_POINTER_TYPE_HSA_VMEM = 6 } hsa_amd_pointer_type_t; /** @@ -2417,6 +2423,13 @@ typedef struct hsa_amd_pointer_info_s { meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ uint32_t global_flags; + + /* + Set to true if this allocation was registered with the underlying driver + This field is not meaningful if the type of the allocation is + HSA_EXT_POINTER_TYPE_UNKNOWN. 
+ */ + bool registered; } hsa_amd_pointer_info_t; /** @@ -3642,12 +3655,106 @@ typedef enum { hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, void* value); +typedef struct hsa_amd_ais_file_handle_s { + /* + * file handle for AIS read & write. Linux will use fd. + * pad keeps the size consistent across different platforms. + */ + union { + void* handle; + int fd; + uint8_t pad[8]; + }; +} hsa_amd_ais_file_handle_t; + +/** + * @brief Write data from device memory to a file + * + * Writes data from device memory buffer to a file at the specified offset. + * The device memory pointer must be accessible from the host and point to + * a valid allocation. + * + * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and + * APIs may be modified + * + * @param[in] handle Handle of the file to write to. + * + * @param[in] devicePtr Device memory buffer pointer containing data to write. + * + * @param[in] size Size in bytes of the data to write. + * + * @param[in] file_offset Offset in bytes into the file where data will be written. + * + * @param[in/out] size_copied Actual number of bytes copied + * + * @param[in/out] status Additional status if any + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr + * is NULL, or @p size is 0. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to + * a valid allocation. + * + * @retval ::HSA_STATUS_ERROR An error occurred during the write operation. 
+ */ +hsa_status_t HSA_API hsa_amd_ais_file_write(hsa_amd_ais_file_handle_t handle, void *devicePtr, + uint64_t size, int64_t file_offset, + uint64_t *size_copied, int32_t *status); + +/** + * @brief Read data from a file to device memory + * + * Reads data from a file at the specified offset into a device memory buffer. + * The device memory pointer must be accessible from the host and point to + * a valid allocation. + * + * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and + * APIs may be modified + * @param[in] handle Handle of the file to read from. + * + * @param[in] devicePtr Device memory buffer pointer to store the read data. + * + * @param[in] size Size in bytes of the data to read. + * + * @param[in] file_offset Offset in bytes into the file where data will be read from. + * + * @param[in/out] size_copied Actual number of bytes copied + * + * @param[in/out] status Additional status if any + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr + * is NULL, or @p size is 0. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to + * a valid allocation. + * + * @retval ::HSA_STATUS_ERROR An error occurred during the read operation. 
+ */ +hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devicePtr, + uint64_t size, int64_t file_offset, + uint64_t *size_copied, int32_t *status); + /** * @brief logging types */ typedef enum hsa_amd_log_flag_s { - /* Log AQL packets internally enqueued by HSA for Blit Kernels */ + /* Log AQL packets internally enqueued by ROCr */ HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS = 0, + HSA_AMD_LOG_FLAG_AQL = 0, + /* Log SDMA packets */ + HSA_AMD_LOG_FLAG_SDMA = 1, + /* Log INFO */ + HSA_AMD_LOG_FLAG_INFO = 2, } hsa_amd_log_flag_t; /** From 437e4b092e37421868c3e572f396c0d5294024e3 Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Tue, 18 Nov 2025 17:28:37 +0800 Subject: [PATCH 29/32] librocdxg: Convert all CmdUtil methods to static Signed-off-by: Flora Cui Reviewed-by: Longlong Yao Part-of: --- wddm/cmd_util.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/wddm/cmd_util.h b/wddm/cmd_util.h index 5826c34afe..f1e7d22d91 100644 --- a/wddm/cmd_util.h +++ b/wddm/cmd_util.h @@ -33,7 +33,7 @@ public: CmdUtil() {}; ~CmdUtil() {}; - size_t BuildCopyData( + static size_t BuildCopyData( uint64_t *pDstAddr, void *pBuffer, uint32_t dstSel = dst_sel__mec_copy_data__tc_l2, @@ -43,32 +43,32 @@ public: uint32_t countSel = count_sel__mec_copy_data__64_bits_of_data, uint32_t wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation); - size_t BuildBarrier( + static size_t BuildBarrier( void *pBuffer, uint32_t eventIndex = event_index__mec_event_write__cs_partial_flush, uint32_t eventType = CS_PARTIAL_FLUSH); - size_t BuildWriteData64Command( + static size_t BuildWriteData64Command( void *pBuffer, uint64_t* write_addr, uint64_t write_value); - size_t BuildAcquireMem( + static size_t BuildAcquireMem( uint8_t major, void *pBuffer); - size_t BuildScratch( + static size_t BuildScratch( void *pScratchBase, void *pBuffer); - size_t BuildComputeShaderParams( + static size_t BuildComputeShaderParams( void *pBuffer); - size_t BuildDispatch( 
+ static size_t BuildDispatch( struct DispatchInfo *pInfo, void *pBuffer); - size_t BuildAtomicMem( + static size_t BuildAtomicMem( uint64_t *pAddr, uint32_t atomic, void *pBuffer, From a2c5e196243f882659b606d19237470dcd51cce5 Mon Sep 17 00:00:00 2001 From: Longlong Yao Date: Thu, 6 Nov 2025 17:31:47 +0800 Subject: [PATCH 30/32] librocdxg: add interface to query segment info Signed-off-by: Longlong Yao --- wddm/device.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/wddm/device.h b/wddm/device.h index 2ba615cfa9..ab873a4bc6 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -71,6 +71,18 @@ class WDDMQueue; #define IS_OVERLAPPING(start1, size1, start2, size2) \ ((start1 < (start2 + size2)) && (start2 < (start1 + size1))) +struct SegmentInfo { + uint32_t segment_id; + uint32_t segment_type; // 0=aperture, 1=gpu memory, 2=system memory + bool aperture; + bool system_memory; + uint64_t commit_limit; + + SegmentInfo() + : segment_id(0), segment_type(0), aperture(false), + system_memory(false), commit_limit(0) {} +}; + class WDDMDevice { public: static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30); // 2 GB @@ -203,6 +215,9 @@ private: void SetPowerOptimization(bool restore); void InitCmdbufInfo(void); + bool QuerySegmentInfo(); + bool GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type, uint32_t &segment_id); + D3DKMT_HANDLE adapter_; LUID adapter_luid_; D3DKMT_HANDLE device_; @@ -218,7 +233,7 @@ private: uint32_t node_id_; // device info thunk_proxy::DeviceInfo device_info_; - + std::vector segment_infos_; //CmdUtil cmd_util; }; From 56eeaf26f857b1c9432393ddb67040e59423b30f Mon Sep 17 00:00:00 2001 From: Longlong Yao Date: Mon, 27 Oct 2025 10:59:00 +0800 Subject: [PATCH 31/32] librocdxg: query total shared GPU memory Signed-off-by: Longlong Yao --- thunk_proxy/thunk_proxy.h | 1 + wddm/device.h | 1 + 2 files changed, 2 insertions(+) diff --git a/thunk_proxy/thunk_proxy.h b/thunk_proxy/thunk_proxy.h index 
3eace0b6e5..d6bdce2451 100644 --- a/thunk_proxy/thunk_proxy.h +++ b/thunk_proxy/thunk_proxy.h @@ -70,6 +70,7 @@ typedef struct { uint32_t asic_revision; uint64_t local_visible_heap_size; uint64_t local_invisible_heap_size; + uint64_t non_local_heap_size; uint64_t private_aperture_base; uint64_t private_aperture_size; uint64_t shared_aperture_base; diff --git a/wddm/device.h b/wddm/device.h index ab873a4bc6..15821b5483 100644 --- a/wddm/device.h +++ b/wddm/device.h @@ -119,6 +119,7 @@ public: uint64_t LocalHeapSize() { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; } uint64_t LocalVisibleHeapSize() { return device_info_.local_visible_heap_size; } uint64_t LocalInvisibleHeapSize() { return device_info_.local_invisible_heap_size; } + uint64_t NonLocalHeapSize() { return device_info_.non_local_heap_size; } uint64_t PrivateApertureBase() { return device_info_.private_aperture_base; } uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } From c34ec1e52fcb52da248c00207ebe646197ea9d3e Mon Sep 17 00:00:00 2001 From: Longlong Yao Date: Mon, 5 Jan 2026 14:58:54 +0800 Subject: [PATCH 32/32] wsl/librocdxg: Change scratch memory allocation Calculate the actual scratch memory size required based on the packet information for kernel dispatch. If the required size exceeds the total allocated memory, scratch memory must be reallocated. Otherwise, no action is needed. 
miopen_gtest: Full/GPU_MIOpenDriverRegressionTest_FP16.MIOpenDriverRegressionHalf/0 Signed-off-by: Longlong Yao Reviewed-by: Flora Cui Reviewed-by: Horatio Zhang --- wddm/queue.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/wddm/queue.h b/wddm/queue.h index 59da825ec2..0e936c5721 100644 --- a/wddm/queue.h +++ b/wddm/queue.h @@ -178,6 +178,10 @@ public: private: hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet); hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false); + + uint64_t CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet); + uint64_t CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, bool wave32); + struct amd_aql_pm4_ib { uint16_t header; uint16_t ven_hdr; @@ -221,7 +225,7 @@ private: return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING); } void HandleError(hsa_status_t status); - bool UpdateScratch(uint32_t private_segment_size, bool wave32); + bool UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32); uint32_t UpdateIndexStride(uint32_t srd, bool wave32); @@ -247,10 +251,13 @@ private: std::condition_variable thread_cond_; static void AqlToPm4Thread(ComputeQueue *queue); - uint32_t scratch_waves_; - uint32_t scratch_size_per_wave_; - uint32_t scratch_size_; + uint64_t max_scratch_waves_; + uint64_t dispatch_waves_; + uint64_t scratch_size_per_wave_; + uint64_t scratch_size_; + uint64_t total_scratch_size_; void *scratch_base_; + uint32_t scratch_mem_alignment_size_; GpuMemoryHandle scratch_mem_; std::vector scratch_base_offset_array_;