diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h new file mode 100644 index 0000000000..4f34bd1d50 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h @@ -0,0 +1,1131 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013-2015, Advanced Micro Devices, Inc. +// All rights reserved. +// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. + +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include /* size_t */ +#include /* uintXX_t */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/*========================================================================================*/ +/* =======================================================================================*/ +/* =======================================================================================*/ +/* =======================================================================================*/ + +typedef uint32_t BrigCodeOffset32_t; +typedef uint32_t BrigOperandOffset32_t; +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; +typedef BrigDataOffset32_t BrigDataOffsetString32_t; + +typedef uint32_t BrigVersion32_t; +enum BrigVersion { + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint16_t BrigKind16_t; +enum BrigKind { + BRIG_KIND_NONE = 0x0000, + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, + + BRIG_KIND_INST_BEGIN = 0x2000, + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, + BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + BRIG_KIND_INST_END = 0x2012, + + BRIG_KIND_OPERAND_BEGIN = 0x3000, + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d +}; + +typedef uint8_t BrigAlignment8_t; +enum BrigAlignment { + BRIG_ALIGNMENT_NONE = 0, + BRIG_ALIGNMENT_1 = 1, + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_256 +}; + +typedef uint8_t BrigAllocation8_t; +enum BrigAllocation { + BRIG_ALLOCATION_NONE = 0, + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +typedef uint8_t BrigAluModifier8_t; +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +typedef uint8_t BrigAtomicOperation8_t; +enum BrigAtomicOperation { + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +typedef uint8_t BrigCompareOperation8_t; +enum BrigCompareOperation { + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ = 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +typedef uint16_t BrigControlDirective16_t; +enum BrigControlDirective { + BRIG_CONTROL_NONE = 0, + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +typedef uint8_t BrigExecutableModifier8_t; +enum BrigExecutableModifierMask { + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +typedef uint8_t BrigImageChannelOrder8_t; +enum BrigImageChannelOrder { + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageChannelType8_t; +enum BrigImageChannelType { + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageGeometry8_t; +enum BrigImageGeometry { + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageQuery8_t; +enum BrigImageQuery { + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + + BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6 +}; + +typedef uint8_t BrigLinkage8_t; +enum BrigLinkage { + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +typedef uint8_t BrigMachineModel8_t; +enum BrigMachineModel { + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, +}; + +typedef uint8_t BrigMemoryModifier8_t; +enum BrigMemoryModifierMask { + BRIG_MEMORY_CONST = 1 +}; + +typedef uint8_t BrigMemoryOrder8_t; +enum BrigMemoryOrder { + BRIG_MEMORY_ORDER_NONE = 0, + BRIG_MEMORY_ORDER_RELAXED = 1, + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, + BRIG_MEMORY_ORDER_SC_RELEASE = 3, + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, +}; + +typedef uint8_t BrigMemoryScope8_t; +enum BrigMemoryScope { + BRIG_MEMORY_SCOPE_NONE = 0, + BRIG_MEMORY_SCOPE_WORKITEM = 1, + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, + BRIG_MEMORY_SCOPE_WORKGROUP = 3, + BRIG_MEMORY_SCOPE_AGENT = 4, + BRIG_MEMORY_SCOPE_SYSTEM = 5, +}; + +typedef uint16_t BrigOpcode16_t; +enum BrigOpcode { + BRIG_OPCODE_NOP = 0, + BRIG_OPCODE_ABS = 1, + BRIG_OPCODE_ADD = 2, + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, + BRIG_OPCODE_COPYSIGN = 6, + BRIG_OPCODE_DIV = 7, + BRIG_OPCODE_FLOOR = 8, + BRIG_OPCODE_FMA = 9, + BRIG_OPCODE_FRACT = 10, + BRIG_OPCODE_MAD = 11, + BRIG_OPCODE_MAX = 12, + BRIG_OPCODE_MIN = 13, + BRIG_OPCODE_MUL = 14, + BRIG_OPCODE_MULHI = 15, + BRIG_OPCODE_NEG = 16, + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, + BRIG_OPCODE_SQRT = 19, + BRIG_OPCODE_SUB = 20, + BRIG_OPCODE_TRUNC = 21, + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, + BRIG_OPCODE_LASTBIT = 39, + BRIG_OPCODE_COMBINE = 40, + BRIG_OPCODE_EXPAND = 41, + BRIG_OPCODE_LDA = 42, + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, + BRIG_OPCODE_UNPACK = 48, + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, + BRIG_OPCODE_UNPACKCVT = 62, + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, + BRIG_OPCODE_SADHI = 65, + BRIG_OPCODE_SEGMENTP = 66, + BRIG_OPCODE_FTOS = 67, + BRIG_OPCODE_STOF = 68, + BRIG_OPCODE_CMP = 69, + BRIG_OPCODE_CVT = 70, + BRIG_OPCODE_LD = 71, + BRIG_OPCODE_ST = 72, + BRIG_OPCODE_ATOMIC = 73, + BRIG_OPCODE_ATOMICNORET = 74, + BRIG_OPCODE_SIGNAL = 75, + BRIG_OPCODE_SIGNALNORET = 76, + BRIG_OPCODE_MEMFENCE = 77, + BRIG_OPCODE_RDIMAGE = 78, + BRIG_OPCODE_LDIMAGE = 79, + BRIG_OPCODE_STIMAGE = 80, + BRIG_OPCODE_IMAGEFENCE = 81, + BRIG_OPCODE_QUERYIMAGE = 82, + BRIG_OPCODE_QUERYSAMPLER = 83, + BRIG_OPCODE_CBR = 84, + BRIG_OPCODE_BR = 85, + BRIG_OPCODE_SBR = 86, + BRIG_OPCODE_BARRIER = 87, + BRIG_OPCODE_WAVEBARRIER = 88, + BRIG_OPCODE_ARRIVEFBAR = 89, + BRIG_OPCODE_INITFBAR = 90, + BRIG_OPCODE_JOINFBAR = 91, + BRIG_OPCODE_LEAVEFBAR = 92, + BRIG_OPCODE_RELEASEFBAR = 93, + BRIG_OPCODE_WAITFBAR = 94, + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, + BRIG_OPCODE_ACTIVELANEID = 97, + BRIG_OPCODE_ACTIVELANEMASK = 98, + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, + BRIG_OPCODE_CALL = 100, + BRIG_OPCODE_SCALL = 101, + BRIG_OPCODE_ICALL = 102, + BRIG_OPCODE_RET = 103, + BRIG_OPCODE_ALLOCA = 104, + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, + BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, + BRIG_OPCODE_LDQUEUEREADINDEX = 123, + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, + BRIG_OPCODE_STQUEUEREADINDEX = 125, + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, + BRIG_OPCODE_GROUPBASEPTR = 130, + BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, + BRIG_OPCODE_WAVEID = 136, + + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, +}; + +typedef uint8_t BrigPack8_t; +enum BrigPack { + BRIG_PACK_NONE = 0, + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +typedef uint8_t BrigProfile8_t; +enum BrigProfile { + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, +}; + +typedef uint16_t BrigRegisterKind16_t; +enum BrigRegisterKind { + BRIG_REGISTER_KIND_CONTROL = 0, + BRIG_REGISTER_KIND_SINGLE = 1, + BRIG_REGISTER_KIND_DOUBLE = 2, + BRIG_REGISTER_KIND_QUAD = 3 +}; + +typedef uint8_t BrigRound8_t; +enum BrigRound { + BRIG_ROUND_NONE = 0, + BRIG_ROUND_FLOAT_DEFAULT = 1, + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, + BRIG_ROUND_FLOAT_ZERO = 3, + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, + BRIG_ROUND_INTEGER_ZERO = 7, + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, + BRIG_ROUND_INTEGER_ZERO_SAT = 11, + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, + BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 +}; + +typedef uint8_t BrigSamplerAddressing8_t; +enum BrigSamplerAddressing { + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerCoordNormalization8_t; +enum BrigSamplerCoordNormalization { + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +typedef uint8_t BrigSamplerFilter8_t; +enum BrigSamplerFilter { + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerQuery8_t; +enum BrigSamplerQuery { + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +typedef uint32_t BrigSectionIndex32_t; +enum BrigSectionIndex { + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, +}; + +typedef uint8_t BrigSegCvtModifier8_t; +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 +}; + +typedef uint8_t BrigSegment8_t; +enum BrigSegment { + BRIG_SEGMENT_NONE = 0, + BRIG_SEGMENT_FLAT = 1, + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128 +}; + +enum { + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +typedef uint16_t BrigType16_t; +enum BrigType { + BRIG_TYPE_NONE = 0, + BRIG_TYPE_U8 = 1, + BRIG_TYPE_U16 = 2, + BRIG_TYPE_U32 = 3, + BRIG_TYPE_U64 = 4, + BRIG_TYPE_S8 = 5, + BRIG_TYPE_S16 = 6, + BRIG_TYPE_S32 = 7, + BRIG_TYPE_S64 = 8, + BRIG_TYPE_F16 = 9, + BRIG_TYPE_F32 = 10, + BRIG_TYPE_F64 = 11, + BRIG_TYPE_B1 = 12, + BRIG_TYPE_B8 = 13, + BRIG_TYPE_B16 = 14, + BRIG_TYPE_B32 = 15, + BRIG_TYPE_B64 = 16, + BRIG_TYPE_B128 = 17, + BRIG_TYPE_SAMP = 18, + BRIG_TYPE_ROIMG = 19, + BRIG_TYPE_WOIMG = 20, + BRIG_TYPE_RWIMG = 21, + BRIG_TYPE_SIG32 = 22, + BRIG_TYPE_SIG64 = 23, + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, +}; + +typedef uint8_t BrigVariableModifier8_t; +enum BrigVariableModifierMask { + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +typedef uint8_t BrigWidth8_t; +enum BrigWidth { + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, +}; + +struct BrigUInt64 { + uint32_t lo; + uint32_t hi; +}; + +struct BrigBase { + uint16_t byteCount; + BrigKind16_t kind; +}; + +struct BrigData { + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigDirectiveArgBlock { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; + uint16_t inArgCount; + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; +}; + +struct BrigDirectiveNone { + BrigBase base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + BrigSegment8_t segment; + BrigAlignment8_t align; + BrigUInt64 dim; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; + BrigVersion32_t hsailMinor; + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; +}; + +struct BrigInstBase { + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier8_t modifier; + uint8_t reserved[3]; +}; + +struct BrigInstMemFence { + BrigInstBase base; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier8_t modifier; + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t query; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t query; + uint8_t reserved[3]; +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier8_t modifier; +}; + +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; + BrigOperandOffset32_t reg; + BrigUInt64 offset; +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; + BrigUInt64 width; + BrigUInt64 height; + BrigUInt64 depth; + BrigUInt64 array; +}; + +struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandRegister { + BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +typedef uint32_t BrigExceptions32_t; +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +struct BrigModuleHeader { + char identification[8]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#ifdef __cplusplus +} +#endif /*__cplusplus*/ + +#endif // defined(INCLUDED_BRIG_H) diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h new file mode 100644 index 0000000000..7c4ed3eea4 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// The following set of header files provides definitions for AMD GPU +// Architecture: +// - amd_hsa_common.h +// - amd_hsa_elf.h +// - amd_hsa_kernel_code.h +// - amd_hsa_queue.h +// - amd_hsa_signal.h +// +// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more +// information. + +#ifndef AMD_HSA_COMMON_H +#define AMD_HSA_COMMON_H + +#include +#include + +// Descriptive version of the HSA Application Binary Interface. +#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)" + +// Alignment attribute that specifies a minimum alignment (in bytes) for +// variables of the specified type. +#if defined(__GNUC__) +# define __ALIGNED__(x) __attribute__((aligned(x))) +#elif defined(_MSC_VER) +# define __ALIGNED__(x) __declspec(align(x)) +#elif defined(RC_INVOKED) +# define __ALIGNED__(x) +#else +# error +#endif + +// Creates enumeration entries for packed types. Enumeration entries include +// bit shift amount, bit width, and bit mask. +#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ + name##_SHIFT = (shift), \ + name##_WIDTH = (width), \ + name = (((1 << (width)) - 1) << (shift)) \ + +// Gets bits for specified mask from specified src packed instance. +#define AMD_HSA_BITS_GET(src, mask) \ + ((src & mask) >> mask ## _SHIFT) \ + +// Sets val bits for specified mask in specified dst packed instance. +#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask##_SHIFT) & ~mask); \ + dst |= (((val) << mask##_SHIFT) & mask) \ + +#endif // AMD_HSA_COMMON_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h new file mode 100644 index 0000000000..2b6c4c9672 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h @@ -0,0 +1,467 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Undefine the macro in case it is defined in the system elf.h. +#undef EM_AMDGPU + +#ifndef AMD_HSA_ELF_H +#define AMD_HSA_ELF_H + +// AMD GPU Specific ELF Header Enumeration Values. +// +// Values are copied from LLVM BinaryFormat/ELF.h . This file also contains +// code object V1 defintions which are not part of the LLVM header. Code object +// V1 was only supported by the Finalizer which is now deprecated and removed. +// +// TODO: Deprecate and remove V1 support and replace this header with using the +// LLVM header. +namespace ELF { + +// Machine architectures +// See current registered ELF machine architectures at: +// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html +enum { + EM_AMDGPU = 224, // AMD GPU architecture +}; + +// OS ABI identification. +enum { + ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime +}; + +// AMDGPU OS ABI Version identification. +enum { + // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification + // was never defined for V1. + ELFABIVERSION_AMDGPU_HSA_V2 = 0, + ELFABIVERSION_AMDGPU_HSA_V3 = 1, + ELFABIVERSION_AMDGPU_HSA_V4 = 2, + ELFABIVERSION_AMDGPU_HSA_V5 = 3, + ELFABIVERSION_AMDGPU_HSA_V6 = 4, +}; + +// AMDGPU specific e_flags. +enum : unsigned { + // Processor selection mask for EF_AMDGPU_MACH_* values. + EF_AMDGPU_MACH = 0x0ff, + + // Not specified processor. + EF_AMDGPU_MACH_NONE = 0x000, + + // AMDGCN-based processors. + // clang-format off + EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, + EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, + EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, + EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, + EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, + EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, + EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027, + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, + EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, + EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, + EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, + EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, + EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, + EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, + EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, + EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, + EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, + EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, + EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, + EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, + EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, + EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, + EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, + EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, + EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, + EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, + EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, + EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, + EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, + EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, + EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, + EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, + EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, + EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057, + EF_AMDGPU_MACH_AMDGCN_GFX1153 = 0x058, + EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC = 0x059, + EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC = 0x05f, + // clang-format on + + // First/last AMDGCN-based processors. + EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, + + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2. + EF_AMDGPU_FEATURE_XNACK_V2 = 0x01, + // Indicates if the trap handler is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2. + EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02, + + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_XNACK_V3 = 0x100, + // Indicates if the "sramecc" target feature is enabled for all code + // contained in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200, + + // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. + EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, + // XNACK is not supported. + EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, + // XNACK is any/default/unspecified. + EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100, + // XNACK is off. + EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200, + // XNACK is on. + EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300, + + // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. + EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, + // SRAMECC is not supported. + EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, + // SRAMECC is any/default/unspecified. + EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400, + // SRAMECC is off. + EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, + // SRAMECC is on. + EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, + + // Generic target versioning. This is contained in the list byte of EFLAGS. + EF_AMDGPU_GENERIC_VERSION = 0xff000000, + EF_AMDGPU_GENERIC_VERSION_OFFSET = 24, + EF_AMDGPU_GENERIC_VERSION_MIN = 1, + EF_AMDGPU_GENERIC_VERSION_MAX = 0xff, +}; + +// ELF Relocation types for AMDGPU. +enum : unsigned { + R_AMDGPU_ABS32_LO = 1, + R_AMDGPU_ABS32_HI = 2, + R_AMDGPU_ABS64 = 3, + R_AMDGPU_ABS32 = 6, + R_AMDGPU_RELATIVE64 = 13, +}; + +} // end namespace ELF + +// ELF Section Header Flag Enumeration Values. +#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS) + +// +typedef enum { + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1, + AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2, + AMDGPU_HSA_SEGMENT_CODE_AGENT = 3, + AMDGPU_HSA_SEGMENT_LAST, +} amdgpu_hsa_elf_segment_t; + +// ELF Program Header Type Enumeration Values. +#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) +#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) +#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT) +#define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT) + +// ELF Symbol Type Enumeration Values. +#define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0) +#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1) +#define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2) + +// ELF Symbol Binding Enumeration Values. +#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0) + +// ELF Symbol Other Information Creation/Retrieval. +#define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3) +#define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4) +#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3)) + +typedef enum { + AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0, + AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1, + AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2, + AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3, + AMDGPU_HSA_SYMBOL_ALLOCATION_LAST, +} amdgpu_hsa_symbol_allocation_t; + +// ELF Symbol Allocation Enumeration Values. +#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT +#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM +#define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT +#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT + +typedef enum { + AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0, + AMDGPU_HSA_SYMBOL_FLAG_CONST = 1, + AMDGPU_HSA_SYMBOL_FLAG_LAST, +} amdgpu_hsa_symbol_flag_t; + +// ELF Symbol Flag Enumeration Values. +#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST + +// Legacy/V1 AMD GPU Relocation Type Enumeration Values. +#define R_AMDGPU_V1_NONE 0 +#define R_AMDGPU_V1_32_LOW 1 +#define R_AMDGPU_V1_32_HIGH 2 +#define R_AMDGPU_V1_64 3 +#define R_AMDGPU_V1_INIT_SAMPLER 4 +#define R_AMDGPU_V1_INIT_IMAGE 5 +#define R_AMDGPU_V1_RELATIVE64 13 + +// AMD GPU Note Type Enumeration Values. +#define NT_AMD_HSA_CODE_OBJECT_VERSION 1 +#define NT_AMD_HSA_HSAIL 2 +#define NT_AMD_HSA_ISA_VERSION 3 +#define NT_AMD_HSA_PRODUCER 4 +#define NT_AMD_HSA_PRODUCER_OPTIONS 5 +#define NT_AMD_HSA_EXTENSION 6 +#define NT_AMD_HSA_ISA_NAME 11 +/* AMDGPU snapshots of runtime, agent and queues state for use in core dump */ +#define NT_AMDGPU_CORE_STATE 33 +#define NT_AMD_HSA_HLDEBUG_DEBUG 101 +#define NT_AMD_HSA_HLDEBUG_TARGET 102 + +// AMD GPU Metadata Kind Enumeration Values. +typedef uint16_t amdgpu_hsa_metadata_kind16_t; +typedef enum { + AMDGPU_HSA_METADATA_KIND_NONE = 0, + AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1, + AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2, + AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3, + AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4 +} amdgpu_hsa_metadata_kind_t; + +// AMD GPU Sampler Coordinate Normalization Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_coord8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0, + AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1 +} amdgpu_hsa_sampler_coord_t; + +// AMD GPU Sampler Filter Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_filter8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0, + AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1 +} amdgpu_hsa_sampler_filter_t; + +// AMD GPU Sampler Addressing Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_addressing8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0, + AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1, + AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2, + AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3, + AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4 +} amdgpu_hsa_sampler_addressing_t; + +// AMD GPU Sampler Descriptor. +typedef struct amdgpu_hsa_sampler_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_sampler_coord8_t coord; + amdgpu_hsa_sampler_filter8_t filter; + amdgpu_hsa_sampler_addressing8_t addressing; + uint8_t reserved1; +} amdgpu_hsa_sampler_descriptor_t; + +// AMD GPU Image Geometry Enumeration Values. +typedef uint8_t amdgpu_hsa_image_geometry8_t; +typedef enum { + AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0, + AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1, + AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2, + AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3, + AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4, + AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5, + AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6, + AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7 +} amdgpu_hsa_image_geometry_t; + +// AMD GPU Image Channel Order Enumeration Values. +typedef uint8_t amdgpu_hsa_image_channel_order8_t; +typedef enum { + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} amdgpu_hsa_image_channel_order_t; + +// AMD GPU Image Channel Type Enumeration Values. +typedef uint8_t amdgpu_hsa_image_channel_type8_t; +typedef enum { + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} amdgpu_hsa_image_channel_type_t; + +// AMD GPU Image Descriptor. +typedef struct amdgpu_hsa_image_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_image_geometry8_t geometry; + amdgpu_hsa_image_channel_order8_t channel_order; + amdgpu_hsa_image_channel_type8_t channel_type; + uint8_t reserved1; + uint64_t width; + uint64_t height; + uint64_t depth; + uint64_t array; +} amdgpu_hsa_image_descriptor_t; + +typedef struct amdgpu_hsa_note_code_object_version_s { + uint32_t major_version; + uint32_t minor_version; +} amdgpu_hsa_note_code_object_version_t; + +typedef struct amdgpu_hsa_note_hsail_s { + uint32_t hsail_major_version; + uint32_t hsail_minor_version; + uint8_t profile; + uint8_t machine_model; + uint8_t default_float_round; +} amdgpu_hsa_note_hsail_t; + +typedef struct amdgpu_hsa_note_isa_s { + uint16_t vendor_name_size; + uint16_t architecture_name_size; + uint32_t major; + uint32_t minor; + uint32_t stepping; + char vendor_and_architecture_name[1]; +} amdgpu_hsa_note_isa_t; + +typedef struct amdgpu_hsa_note_producer_s { + uint16_t producer_name_size; + uint16_t reserved; + uint32_t producer_major_version; + uint32_t producer_minor_version; + char producer_name[1]; +} amdgpu_hsa_note_producer_t; + +typedef struct amdgpu_hsa_note_producer_options_s { + uint16_t producer_options_size; + char producer_options[1]; +} amdgpu_hsa_note_producer_options_t; + +typedef enum { + AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_RODATA_GLOBAL_AGENT, + AMDGPU_HSA_RODATA_READONLY_AGENT, + AMDGPU_HSA_DATA_GLOBAL_PROGRAM, + AMDGPU_HSA_DATA_GLOBAL_AGENT, + AMDGPU_HSA_DATA_READONLY_AGENT, + AMDGPU_HSA_BSS_GLOBAL_PROGRAM, + AMDGPU_HSA_BSS_GLOBAL_AGENT, + AMDGPU_HSA_BSS_READONLY_AGENT, + AMDGPU_HSA_SECTION_LAST, +} amdgpu_hsa_elf_section_t; + +#endif // AMD_HSA_ELF_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h new file mode 100644 index 0000000000..c00c88c024 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h @@ -0,0 +1,270 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_KERNEL_CODE_H +#define AMD_HSA_KERNEL_CODE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Kernel Code Version Enumeration Values. +typedef uint32_t amd_kernel_code_version32_t; +enum amd_kernel_code_version_t { + AMD_KERNEL_CODE_VERSION_MAJOR = 1, + AMD_KERNEL_CODE_VERSION_MINOR = 1 +}; + +// AMD Machine Kind Enumeration Values. +typedef uint16_t amd_machine_kind16_t; +enum amd_machine_kind_t { + AMD_MACHINE_KIND_UNDEFINED = 0, + AMD_MACHINE_KIND_AMDGPU = 1 +}; + +// AMD Machine Version. +typedef uint16_t amd_machine_version16_t; + +// AMD Float Round Mode Enumeration Values. +enum amd_float_round_mode_t { + AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0, + AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1, + AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2, + AMD_FLOAT_ROUND_MODE_ZERO = 3 +}; + +// AMD Float Denorm Mode Enumeration Values. +enum amd_float_denorm_mode_t { + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0, + AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1, + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2, + AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3 +}; + +// AMD Compute Program Resource Register One. +typedef uint32_t amd_compute_pgm_rsrc_one32_t; +enum amd_compute_pgm_rsrc_one_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6) +}; + +// AMD System VGPR Workitem ID Enumeration Values. +enum amd_system_vgpr_workitem_id_t { + AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2, + AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3 +}; + +// AMD Compute Program Resource Register Two. +typedef uint32_t amd_compute_pgm_rsrc_two32_t; +enum amd_compute_pgm_rsrc_two_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1) +}; + +// AMD Element Byte Size Enumeration Values. +enum amd_element_byte_size_t { + AMD_ELEMENT_BYTE_SIZE_2 = 0, + AMD_ELEMENT_BYTE_SIZE_4 = 1, + AMD_ELEMENT_BYTE_SIZE_8 = 2, + AMD_ELEMENT_BYTE_SIZE_16 = 3 +}; + +// AMD Kernel Code Properties. +typedef uint32_t amd_kernel_code_properties32_t; +enum amd_kernel_code_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9) +}; + +// AMD Power Of Two Enumeration Values. +typedef uint8_t amd_powertwo8_t; +enum amd_powertwo_t { + AMD_POWERTWO_1 = 0, + AMD_POWERTWO_2 = 1, + AMD_POWERTWO_4 = 2, + AMD_POWERTWO_8 = 3, + AMD_POWERTWO_16 = 4, + AMD_POWERTWO_32 = 5, + AMD_POWERTWO_64 = 6, + AMD_POWERTWO_128 = 7, + AMD_POWERTWO_256 = 8 +}; + +// AMD Enabled Control Directive Enumeration Values. +typedef uint64_t amd_enabled_control_directive64_t; +enum amd_enabled_control_directive_t { + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1, + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256 +}; + +// AMD Exception Kind Enumeration Values. +typedef uint16_t amd_exception_kind16_t; +enum amd_exception_kind_t { + AMD_EXCEPTION_KIND_INVALID_OPERATION = 1, + AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2, + AMD_EXCEPTION_KIND_OVERFLOW = 4, + AMD_EXCEPTION_KIND_UNDERFLOW = 8, + AMD_EXCEPTION_KIND_INEXACT = 16 +}; + +// AMD Control Directives. +#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64 +#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) +typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s { + amd_enabled_control_directive64_t enabled_control_directives; + uint16_t enable_break_exceptions; + uint16_t enable_detect_exceptions; + uint32_t max_dynamic_group_size; + uint64_t max_flat_grid_size; + uint32_t max_flat_workgroup_size; + uint8_t required_dim; + uint8_t reserved1[3]; + uint64_t required_grid_size[3]; + uint32_t required_workgroup_size[3]; + uint8_t reserved2[60]; +} amd_control_directives_t; + +// AMD Kernel Code. +#define AMD_ISA_ALIGN_BYTES 256 +#define AMD_KERNEL_CODE_ALIGN_BYTES 64 +#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) +typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s { + amd_kernel_code_version32_t amd_kernel_code_version_major; + amd_kernel_code_version32_t amd_kernel_code_version_minor; + amd_machine_kind16_t amd_machine_kind; + amd_machine_version16_t amd_machine_version_major; + amd_machine_version16_t amd_machine_version_minor; + amd_machine_version16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; + amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; + amd_kernel_code_properties32_t kernel_code_properties; + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t debug_private_segment_buffer_sgpr; + amd_powertwo8_t kernarg_segment_alignment; + amd_powertwo8_t group_segment_alignment; + amd_powertwo8_t private_segment_alignment; + amd_powertwo8_t wavefront_size; + int32_t call_convention; + uint8_t reserved1[12]; + uint64_t runtime_loader_kernel_symbol; + amd_control_directives_t control_directives; +} amd_kernel_code_t; + +// TODO: this struct should be completely gone once debugger designs/implements +// Debugger APIs. +typedef struct amd_runtime_loader_debug_info_s { + const void* elf_raw; + size_t elf_size; + const char *kernel_name; + const void *owning_segment; +} amd_runtime_loader_debug_info_t; + +#endif // AMD_HSA_KERNEL_CODE_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h new file mode 100644 index 0000000000..9f16f9b2e5 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h @@ -0,0 +1,154 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_QUEUE_H +#define AMD_HSA_QUEUE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Queue Properties. +typedef uint32_t amd_queue_properties32_t; +enum amd_queue_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27) +}; + +// AMD Queue. +#define AMD_QUEUE_ALIGN_BYTES 64 +#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES) + +// AMD Queue Capabilities. +typedef uint32_t amd_queue_capabilities32_t; +enum amd_queue_capabilities_t { + /* This version of CP FW supports dual-scratch and async-reclaim */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_CP_ASYNC_RECLAIM, 0, 1), + + /* + * This version of ROCr supports async-reclaim and CP FW may access the + * V2 fields. + */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM, 1, 1), +}; + +/* This is the original amd_queue_t structure. The definition is only kept + * for reference purposes. This structure should not be used. */ +typedef struct AMD_QUEUE_ALIGN amd_queue_s { + hsa_queue_t hsa_queue; + uint32_t caps; + uint32_t reserved1[3]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint32_t reserved3[2]; + uint32_t scratch_wave64_lane_byte_size; + amd_queue_properties32_t queue_properties; + uint32_t reserved4[2]; + hsa_signal_t queue_inactive_signal; + uint32_t reserved5[14]; +} amd_queue_t; + +/* + * AMD_QUEUE Version 2 + * amd_queue_v2_t is backwards compatible with amd_queue_t structure and can + * be used with previous versions of CP FW. The added fields tagged as V2 are + * ignored when running previous versions of CP FW. + * CP FW will not try to access elements beyond the original 64-bytes + * (sizeof(amd_queue_t)) unless the AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM bit is set. + */ + +#define MAX_NUM_XCC 128 +typedef struct scratch_last_used_index_xcc_s { + volatile uint64_t main; + volatile uint64_t alt; +} scratch_last_used_index_xcc_t; + +typedef struct AMD_QUEUE_ALIGN amd_queue_v2_s { + hsa_queue_t hsa_queue; + uint32_t caps; + uint32_t reserved1[3]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint64_t scratch_backing_memory_byte_size; + uint32_t scratch_wave64_lane_byte_size; + amd_queue_properties32_t queue_properties; + volatile uint64_t scratch_max_use_index; /* V2 */ + hsa_signal_t queue_inactive_signal; + volatile uint64_t alt_scratch_max_use_index; /* V2 */ + uint32_t alt_scratch_resource_descriptor[4]; /* V2 */ + uint64_t alt_scratch_backing_memory_location; /* V2 */ + uint32_t alt_scratch_dispatch_limit_x; /* V2 */ + uint32_t alt_scratch_dispatch_limit_y; /* V2 */ + uint32_t alt_scratch_dispatch_limit_z; /* V2 */ + uint32_t alt_scratch_wave64_lane_byte_size; /* V2 */ + uint32_t alt_compute_tmpring_size; /* V2 */ + uint32_t reserved5; + + scratch_last_used_index_xcc_t scratch_last_used_index[MAX_NUM_XCC]; +} amd_queue_v2_t; + +#endif // AMD_HSA_QUEUE_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h new file mode 100644 index 0000000000..fa797599a0 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_SIGNAL_H +#define AMD_HSA_SIGNAL_H + +#include "amd_hsa_common.h" +#include "amd_hsa_queue.h" + +// AMD Signal Kind Enumeration Values. +typedef int64_t amd_signal_kind64_t; +enum amd_signal_kind_t { + AMD_SIGNAL_KIND_INVALID = 0, + AMD_SIGNAL_KIND_USER = 1, + AMD_SIGNAL_KIND_DOORBELL = -1, + AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 +}; + +// AMD Signal. +#define AMD_SIGNAL_ALIGN_BYTES 64 +#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES) +typedef struct AMD_SIGNAL_ALIGN amd_signal_s { + amd_signal_kind64_t kind; + union { + volatile int64_t value; + volatile uint64_t* hardware_doorbell_ptr; + }; + uint64_t event_mailbox_ptr; + uint32_t event_id; + uint32_t reserved1; + uint64_t start_ts; + uint64_t end_ts; + union { + amd_queue_v2_t* queue_ptr; + uint64_t reserved2; + }; + uint32_t reserved3[2]; +} amd_signal_t; + +#endif // AMD_HSA_SIGNAL_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h new file mode 100644 index 0000000000..00753e992e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h @@ -0,0 +1,5752 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_H_ +#define HSA_RUNTIME_INC_HSA_H_ + +#include /* size_t */ +#include /* uintXX_t */ + +#ifndef __cplusplus +#include /* bool */ +#endif /* __cplusplus */ + +// Placeholder for calling convention and import/export macros +#ifndef HSA_CALL +#define HSA_CALL +#endif + +#ifndef HSA_EXPORT_DECORATOR +#ifdef __GNUC__ +#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default"))) +#else +#define HSA_EXPORT_DECORATOR +#endif +#endif +#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL +#define HSA_API_IMPORT HSA_CALL + +#if !defined(HSA_API) && defined(HSA_EXPORT) +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +// Detect and set large model builds. +#undef HSA_LARGE_MODEL +#if defined(__LP64__) || defined(_M_X64) +#define HSA_LARGE_MODEL +#endif + +// Try to detect CPU endianness +#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU) +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LITTLEENDIAN_CPU +#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define BIGENDIAN_CPU +#elif defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) || defined(__loongarch64) || defined(__riscv) +#define LITTLEENDIAN_CPU +#endif +#endif + +#undef HSA_LITTLE_ENDIAN +#if defined(LITTLEENDIAN_CPU) +#define HSA_LITTLE_ENDIAN +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +#ifndef HSA_DEPRECATED +#define HSA_DEPRECATED +//#ifdef __GNUC__ +//#define HSA_DEPRECATED __attribute__((deprecated)) +//#else +//#define HSA_DEPRECATED __declspec(deprecated) +//#endif +#endif + +#define HSA_VERSION_1_0 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \addtogroup error-codes Error codes + * @{ + */ + +/** + * @brief Status codes. + */ +typedef enum { + /** + * The function has been executed successfully. + */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred. + */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. + */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events. + */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a functions are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid. + */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception. + */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016, + /** + * The code object symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, + /** + * The executable symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, + /** + * The file descriptor is invalid. + */ + HSA_STATUS_ERROR_INVALID_FILE = 0x1020, + /** + * The code object reader is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, + /** + * The cache is invalid. + */ + HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, + /** + * The wavefront is invalid. + */ + HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, + /** + * The signal group is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, + /** + * The HSA runtime is not in the configuration state. + */ + HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025, + /** + * The queue received an error that may require process termination. + */ + HSA_STATUS_ERROR_FATAL = 0x1026 +} hsa_status_t; + +/** + * @brief Query additional information about a status code. + * + * @param[in] status Status code. + * + * @param[out] status_string A NUL-terminated string that describes the error + * status. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid + * status code, or @p status_string is NULL. + */ +hsa_status_t HSA_API hsa_status_string( + hsa_status_t status, + const char ** status_string); + +/** @} */ + +/** \defgroup common Common Definitions + * @{ + */ + +/** + * @brief Three-dimensional coordinate. + */ +typedef struct hsa_dim3_s { + /** + * X dimension. + */ + uint32_t x; + + /** + * Y dimension. + */ + uint32_t y; + + /** + * Z dimension. + */ + uint32_t z; +} hsa_dim3_t; + +/** + * @brief Access permissions. + */ +typedef enum { + /** + * Used to remove existing access + */ + HSA_ACCESS_PERMISSION_NONE = 0, + /** + * Read-only access. + */ + HSA_ACCESS_PERMISSION_RO = 1, + /** + * Write-only access. + */ + HSA_ACCESS_PERMISSION_WO = 2, + /** + * Read and write access. + */ + HSA_ACCESS_PERMISSION_RW = 3 +} hsa_access_permission_t; + +/** + * @brief POSIX file descriptor. + */ +typedef int hsa_file_t; + +/** @} **/ + + +/** \defgroup initshutdown Initialization and Shut Down + * @{ + */ + +/** + * @brief Initialize the HSA runtime. + * + * @details Initializes the HSA runtime if it is not already initialized, and + * increases the reference counter associated with the HSA runtime for the + * current process. Invocation of any HSA function other than ::hsa_init results + * in undefined behavior if the current HSA runtime reference counter is less + * than one. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference + * count reaches INT32_MAX. + */ +hsa_status_t HSA_API hsa_init(); + +/** + * @brief Shut down the HSA runtime. + * + * @details Decreases the reference count of the HSA runtime instance. When the + * reference count reaches 0, the HSA runtime is no longer considered valid + * but the application might call ::hsa_init to initialize the HSA runtime + * again. + * + * Once the reference count of the HSA runtime reaches 0, all the resources + * associated with it (queues, signals, agent information, etc.) are + * considered invalid and any attempt to reference them in subsequent API calls + * results in undefined behavior. When the reference count reaches 0, the HSA + * runtime may release resources associated with it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_shut_down(); + +/** @} **/ + +/** \defgroup agentinfo System and Agent Information + * @{ + */ + +/** + * @brief Endianness. A convention used to interpret the bytes making up a data + * word. + */ +typedef enum { + /** + * The least significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_LITTLE = 0, + /** + * The most significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_BIG = 1 +} hsa_endianness_t; + +/** + * @brief Machine model. A machine model determines the size of certain data + * types in HSA runtime and an agent. + */ +typedef enum { + /** + * Small machine model. Addresses use 32 bits. + */ + HSA_MACHINE_MODEL_SMALL = 0, + /** + * Large machine model. Addresses use 64 bits. + */ + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +/** + * @brief Profile. A profile indicates a particular level of feature + * support. For example, in the base profile the application must use the HSA + * runtime allocator to reserve shared virtual memory, while in the full profile + * any host pointer can be shared across all the agents. + */ +typedef enum { + /** + * Base profile. + */ + HSA_PROFILE_BASE = 0, + /** + * Full profile. + */ + HSA_PROFILE_FULL = 1 +} hsa_profile_t; + +/** + * @brief System attributes. + */ +typedef enum { + /** + * Major version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MAJOR = 0, + /** + * Minor version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MINOR = 1, + /** + * Current timestamp. The value of this attribute monotonically increases at a + * constant rate. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP = 2, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3, + /** + * Maximum duration of a signal wait operation. Expressed as a count based on + * the timestamp frequency. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4, + /** + * Endianness of the system. The type of this attribute is ::hsa_endianness_t. + */ + HSA_SYSTEM_INFO_ENDIANNESS = 5, + /** + * Machine model supported by the HSA runtime. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_SYSTEM_INFO_MACHINE_MODEL = 6, + /** + * Bit-mask indicating which extensions are supported by the + * implementation. An extension with an ID of @p i is supported if the bit at + * position @p i is set. The type of this attribute is uint8_t[128]. + */ + HSA_SYSTEM_INFO_EXTENSIONS = 7, + /** + * String containing the ROCr build identifier. + */ + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202, + /** + * Returns true if mwaitx is enabled on this system + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203, + /** + * Returns true if DMABUF APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED = 0x204, + /** + * Returns true if Virtual Memory APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED = 0x205, + /** + * Returns true if XNACK is enabled on this system. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_XNACK_ENABLED = 0x206, + /** + * Major version of the HSA runtime extension specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_AMD_SYSTEM_INFO_EXT_VERSION_MAJOR = 0x207, + /** + * Minor version of the HSA runtime extension specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_AMD_SYSTEM_INFO_EXT_VERSION_MINOR = 0x208, +} hsa_system_info_t; + +/** + * @brief Get the current value of a system attribute. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * system attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_system_get_info( + hsa_system_info_t attribute, + void* value); + +/** + * @brief HSA extensions. + */ +typedef enum { + /** + * Finalizer extension. + */ + HSA_EXTENSION_FINALIZER = 0, + /** + * Images extension. + */ + HSA_EXTENSION_IMAGES = 1, + + /** + * Performance counter extension. + */ + HSA_EXTENSION_PERFORMANCE_COUNTERS = 2, + + /** + * Profiling events extension. + */ + HSA_EXTENSION_PROFILING_EVENTS = 3, + /** + * Extension count. + */ + HSA_EXTENSION_STD_LAST = 3, + /** + * First AMD extension number. + */ + HSA_AMD_FIRST_EXTENSION = 0x200, + /** + * Profiler extension. + */ + HSA_EXTENSION_AMD_PROFILER = 0x200, + /** + * Loader extension. + */ + HSA_EXTENSION_AMD_LOADER = 0x201, + /** + * AqlProfile extension. + */ + HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * PC Sampling extension. + */ + HSA_EXTENSION_AMD_PC_SAMPLING = 0x203, + /** + * Last AMD extension. + */ + HSA_AMD_LAST_EXTENSION = 0x203 +} hsa_extension_t; + +/** + * @brief Query the name of a given extension. + * + * @param[in] extension Extension identifier. If the extension is not supported + * by the implementation (see ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior + * is undefined. + * + * @param[out] name Pointer to a memory location where the HSA runtime stores + * the extension name. The extension name is a NUL-terminated string. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p name is NULL. + */ +hsa_status_t HSA_API hsa_extension_get_name( + uint16_t extension, + const char **name); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by the HSA + * implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by the HSA + * implementation. All minor versions from 0 up to the returned @p version_minor + * must be supported by the implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_system_major_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** + * @deprecated + * + * @brief Retrieve the function pointers corresponding to a given version of an + * extension. Portable applications are expected to invoke the extension API + * using the returned function pointers + * + * @details The application is responsible for verifying that the given version + * of the extension is supported by the HSA implementation (see + * ::hsa_system_extension_supported). If the given combination of extension, + * major version, and minor version is not supported by the implementation, the + * behavior is undefined. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] version_minor Minor version number for which to retrieve the + * function pointer table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + void *table); + +/** + * @brief Retrieve the function pointers corresponding to a given major version + * of an extension. Portable applications are expected to invoke the extension + * API using the returned function pointers. + * + * @details The application is responsible for verifying that the given major + * version of the extension is supported by the HSA implementation (see + * ::hsa_system_major_extension_supported). If the given combination of extension + * and major version is not supported by the implementation, the behavior is + * undefined. Additionally if the length doesn't allow space for a full minor + * version, it is implementation defined if only some of the function pointers for + * that minor version get written. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] table_length Size in bytes of the function pointer table to be + * populated. The implementation will not write more than this many bytes to the + * table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API hsa_system_get_major_extension_table( + uint16_t extension, + uint16_t version_major, + size_t table_length, + void *table); + +/** + * @brief Struct containing an opaque handle to an agent, a device that participates in + * the HSA memory model. An agent can submit AQL packets for execution, and + * may also accept AQL packets for execution (agent dispatch packets or kernel + * dispatch packets launching HSAIL-derived binaries). + */ +typedef struct hsa_agent_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_agent_t; + +/** + * @brief Agent features. + */ +typedef enum { + /** + * The agent supports AQL packets of kernel dispatch type. If this + * feature is enabled, the agent is also a kernel agent. + */ + HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, + /** + * The agent supports AQL packets of agent dispatch type. + */ + HSA_AGENT_FEATURE_AGENT_DISPATCH = 2 +} hsa_agent_feature_t; + +/** + * @brief Hardware device type. + */ +typedef enum { + /** + * CPU device. + */ + HSA_DEVICE_TYPE_CPU = 0, + /** + * GPU device. + */ + HSA_DEVICE_TYPE_GPU = 1, + /** + * DSP device. + */ + HSA_DEVICE_TYPE_DSP = 2, + /** + * AI Engine (AIE) device. + */ + HSA_DEVICE_TYPE_AIE = 3 +} hsa_device_type_t; + +/** + * @brief Default floating-point rounding mode. + */ +typedef enum { + /** + * Use a default floating-point rounding mode specified elsewhere. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, + /** + * Operations that specify the default floating-point mode are rounded to zero + * by default. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, + /** + * Operations that specify the default floating-point mode are rounded to the + * nearest representable number and that ties should be broken by selecting + * the value with an even least significant bit. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2 +} hsa_default_float_rounding_mode_t; + +/** + * @brief Agent attributes. + */ +typedef enum { + /** + * Agent name. The type of this attribute is a NUL-terminated char[64]. The + * name must be at most 63 characters long (not including the NUL terminator) + * and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_NAME = 0, + /** + * Name of vendor. The type of this attribute is a NUL-terminated char[64]. + * The name must be at most 63 characters long (not including the NUL + * terminator) and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_VENDOR_NAME = 1, + /** + * Agent capability. The type of this attribute is ::hsa_agent_feature_t. + */ + HSA_AGENT_INFO_FEATURE = 2, + /** + * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given intruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Machine model supported by the agent. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_AGENT_INFO_MACHINE_MODEL = 3, + /** + * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given intruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Profile supported by the agent. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_AGENT_INFO_PROFILE = 4, + /** + * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given + * intruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas. + * + * Default floating-point rounding mode. The type of this attribute is + * ::hsa_default_float_rounding_mode_t, but the value + * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed. + */ + HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5, + /** + * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES + * for a given intruction set architecture supported by the agent instead. If + * more than one ISA is supported by the agent, the returned value corresponds + * to the first ISA enumerated by ::hsa_agent_iterate_isas. + * + * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the + * default floating-point rounding modes supported by the agent in the Base + * profile. The type of this attribute is uint32_t. The default floating-point + * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not + * be set. + */ + HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23, + /** + * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given intruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the current agent. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is bool. + */ + HSA_AGENT_INFO_FAST_F16_OPERATION = 24, + /** + * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and + * intruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront + * enumerated by ::hsa_isa_iterate_wavefronts for that ISA. + * + * Number of work-items in a wavefront. Must be a power of 2 in the range + * [1,256]. The value of this attribute is undefined if the agent is not + * a kernel agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given intruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0. No maximum can exceed the value of + * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is uint16_t[3]. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given intruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a work-group. The value of this + * attribute is undefined if the agent is not a kernel agent. The type + * of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given intruction set + * architecture supported by the agent instead. + * + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined + * if the agent is not a kernel agent. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_AGENT_INFO_GRID_MAX_DIM = 9, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given intruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a grid. The value of this attribute + * is undefined if the agent is not a kernel agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_GRID_MAX_SIZE = 10, + /** + * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given intruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of fbarriers per work-group. Must be at least 32. The value + * of this attribute is undefined if the agent is not a kernel agent. The + * type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, + /** + * @deprecated The maximum number of queues is not statically determined. + * + * Maximum number of queues that can be active (created but not destroyed) at + * one time in the agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUES_MAX = 12, + /** + * Minimum number of packets that a queue created in the agent + * can hold. Must be a power of 2 greater than 0. Must not exceed + * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, + /** + * Maximum number of packets that a queue created in the agent can + * hold. Must be a power of 2 greater than 0. The type of this attribute + * is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, + /** + * Type of a queue created in the agent. The type of this attribute is + * ::hsa_queue_type32_t. + */ + HSA_AGENT_INFO_QUEUE_TYPE = 15, + /** + * @deprecated NUMA information is not exposed anywhere else in the API. + * + * Identifier of the NUMA node associated with the agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_NODE = 16, + /** + * Type of hardware device associated with the agent. The type of this + * attribute is ::hsa_device_type_t. + */ + HSA_AGENT_INFO_DEVICE = 17, + /** + * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about + * the caches present in a given agent. + * + * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size + * of 0 for a particular level indicates that there is no cache information + * for that level. The type of this attribute is uint32_t[4]. + */ + HSA_AGENT_INFO_CACHE_SIZE = 18, + /** + * @deprecated An agent may support multiple instruction set + * architectures. See ::hsa_agent_iterate_isas. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Instruction set architecture of the agent. The type of this attribute + * is ::hsa_isa_t. + */ + HSA_AGENT_INFO_ISA = 19, + /** + * Bit-mask indicating which extensions are supported by the agent. An + * extension with an ID of @p i is supported if the bit at position @p i is + * set. The type of this attribute is uint8_t[128]. + */ + HSA_AGENT_INFO_EXTENSIONS = 20, + /** + * Major version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MAJOR = 21, + /** + * Minor version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MINOR = 22, + /** + * This enum does not have a fixed underlying type, thus in C++ post D2338: + * If the enumeration type does not have a fixed underlying type, the value is + * unchanged if the original value is within the range of the enumeration + * values (9.7.1 [dcl.enum]), and otherwise, the behavior is + * undefined. + * Thus increase the range of this enum to encompass vendor extensions. + */ + HSA_AGENT_INFO_LAST = INT32_MAX +} hsa_agent_info_t; + +/** + * @brief Get the current value of an attribute for a given agent. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * agent attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_agent_get_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** + * @brief Iterate over the available agents, and invoke an + * application-defined callback on every iteration. + * + * @param[in] callback Callback to be invoked once per agent. The HSA + * runtime passes two arguments to the callback: the agent and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_iterate_agents returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t HSA_API hsa_iterate_agents( + hsa_status_t (*callback)(hsa_agent_t agent, void* data), + void* data); + +/* + +// If we do not know the size of an attribute, we need to query it first +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_get_info_size( + hsa_agent_t agent, + hsa_agent_info_t attribute, + size_t* size); + +// Set the value of an agents attribute +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_set_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +*/ + +/** + * @brief Exception policies applied in the presence of hardware exceptions. + */ +typedef enum { + /** + * If a hardware exception is detected, a work-item signals an exception. + */ + HSA_EXCEPTION_POLICY_BREAK = 1, + /** + * If a hardware exception is detected, a hardware status bit is set. + */ + HSA_EXCEPTION_POLICY_DETECT = 2 +} hsa_exception_policy_t; + +/** + * @deprecated Use ::hsa_isa_get_exception_policies for a given intruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, this function uses the first value returned by + * ::hsa_agent_iterate_isas. + * + * @brief Retrieve the exception policy support for a given combination of + * agent and profile + * + * @param[in] agent Agent. + * + * @param[in] profile Profile. + * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + * + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies( + hsa_agent_t agent, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Cache handle. + */ +typedef struct hsa_cache_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_cache_t; + +/** + * @brief Cache attributes. + */ +typedef enum { + /** + * The length of the cache name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is a NUL-terminated + * character array with the length equal to the value of + * ::HSA_CACHE_INFO_NAME_LENGTH attribute. + */ + HSA_CACHE_INFO_NAME = 1, + /** + * Cache level. A L1 cache must return a value of 1, a L2 must return a value + * of 2, and so on. The type of this attribute is uint8_t. + */ + HSA_CACHE_INFO_LEVEL = 2, + /** + * Cache size, in bytes. A value of 0 indicates that there is no size + * information available. The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_SIZE = 3 +} hsa_cache_info_t; + +/** + * @brief Get the current value of an attribute for a given cache object. + * + * @param[in] cache Cache. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_cache_get_info( + hsa_cache_t cache, + hsa_cache_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory caches of a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details Caches are visited in ascending order according to the value of the + * ::HSA_CACHE_INFO_LEVEL attribute. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per cache that is present in + * the agent. The HSA runtime passes two arguments to the callback: the cache + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_caches( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_cache_t cache, void* data), + void* data); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by an agent + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by an agent. All + * minor versions from 0 up to the returned @p version_minor must be supported. + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_agent_major_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** @} */ + + +/** \defgroup signals Signals + * @{ + */ + +/** + * @brief Signal handle. + */ +typedef struct hsa_signal_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. The value 0 is reserved. + */ + uint64_t handle; +} hsa_signal_t; + +/** + * @brief Signal value. The value occupies 32 bits in small machine mode, and 64 + * bits in large machine mode. + */ +#ifdef HSA_LARGE_MODEL + typedef int64_t hsa_signal_value_t; +#else + typedef int32_t hsa_signal_value_t; +#endif + +/** + * @brief Create a signal. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. + */ +hsa_status_t HSA_API hsa_signal_create( + hsa_signal_value_t initial_value, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_t *signal); + +/** + * @brief Destroy a signal previous created by ::hsa_signal_create. + * + * @param[in] signal Signal. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0. + */ +hsa_status_t HSA_API hsa_signal_destroy( + hsa_signal_t signal); + +/** + * @brief Atomically read the current value of a signal. + * + * @param[in] signal Signal. + * + * @return Value of the signal. +*/ +hsa_signal_value_t HSA_API hsa_signal_load_scacquire( + hsa_signal_t signal); + +/** + * @copydoc hsa_signal_load_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_load_relaxed( + hsa_signal_t signal); + +/** + * @deprecated Renamed as ::hsa_signal_load_scacquire. + * + * @copydoc hsa_signal_load_scacquire +*/ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire( + hsa_signal_t signal); + +/** + * @brief Atomically set the value of a signal. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API hsa_signal_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_store_relaxed + */ +void HSA_API hsa_signal_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_store_screlease. + * + * @copydoc hsa_signal_store_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_store_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal without necessarily notifying the + * the agents waiting on it. + * + * @details The agents waiting on @p signal may not wake up even when the new + * value satisfies their wait condition. If the application wants to update the + * signal and there is no need to notify any agent, invoking this function can + * be more efficient than calling the non-silent counterpart. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API hsa_signal_silent_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_silent_store_relaxed + */ +void HSA_API hsa_signal_silent_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal and return its previous value. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value New value. + * + * @return Value of the signal prior to the exchange. + * + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl. + * + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacquire. + * + * @copydoc hsa_signal_exchange_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_screlease. + * + * @copydoc hsa_signal_exchange_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal if the observed value is equal to + * the expected value. The observed value is returned regardless of whether the + * replacement was done. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue + * doorbell signal, the behavior is undefined. + * + * @param[in] expected Value to compare with. + * + * @param[in] value New value. + * + * @return Observed value of the signal. + * + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacq_screl. + * + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacquire. + * + * @copydoc hsa_signal_cas_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_relaxed( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_screlease( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_screlease. + * + * @copydoc hsa_signal_cas_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @brief Atomically increment the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to add to the value of the signal. + * + */ +void HSA_API hsa_signal_add_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacq_screl. + * + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacquire. + * + * @copydoc hsa_signal_add_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_add_screlease. + * + * @copydoc hsa_signal_add_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically decrement the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to subtract from the value of the signal. + * + */ +void HSA_API hsa_signal_subtract_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl. + * + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacquire. + * + * @copydoc hsa_signal_subtract_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_screlease. + * + * @copydoc hsa_signal_subtract_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise AND operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to AND with the value of the signal. + * + */ +void HSA_API hsa_signal_and_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacq_screl. + * + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacquire. + * + * @copydoc hsa_signal_and_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_and_screlease. + * + * @copydoc hsa_signal_and_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise OR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to OR with the value of the signal. + */ +void HSA_API hsa_signal_or_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_or_scacq_screl. + * + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_scacquire. + * + * @copydoc hsa_signal_or_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_screlease. + * + * @copydoc hsa_signal_or_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise XOR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to XOR with the value of the signal. + * + */ +void HSA_API hsa_signal_xor_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacq_screl. + * + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacquire. + * + * @copydoc hsa_signal_xor_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_screlease. + * + * @copydoc hsa_signal_xor_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Wait condition operator. + */ +typedef enum { + /** + * The two operands are equal. + */ + HSA_SIGNAL_CONDITION_EQ = 0, + /** + * The two operands are not equal. + */ + HSA_SIGNAL_CONDITION_NE = 1, + /** + * The first operand is less than the second operand. + */ + HSA_SIGNAL_CONDITION_LT = 2, + /** + * The first operand is greater than or equal to the second operand. + */ + HSA_SIGNAL_CONDITION_GTE = 3 +} hsa_signal_condition_t; + +/** + * @brief State of the application thread during a signal wait. + */ +typedef enum { + /** + * The application thread may be rescheduled while waiting on the signal. + */ + HSA_WAIT_STATE_BLOCKED = 0, + /** + * The application thread stays active while waiting on a signal. + */ + HSA_WAIT_STATE_ACTIVE = 1 +} hsa_wait_state_t; + + +/** + * @brief Wait until a signal value satisfies a specified condition, or a + * certain amount of time has elapsed. + * + * @details A wait operation can spuriously resume at any time sooner than the + * timeout (for example, due to system or other external factors) even when the + * condition has not been met. + * + * The function is guaranteed to return if the signal value satisfies the + * condition at some point in time during the wait, but the value returned to + * the application might not satisfy the condition. The application must ensure + * that signals are used in such way that wait wakeup conditions are not + * invalidated before dependent threads have woken up. + * + * When the wait operation internally loads the value of the passed signal, it + * uses the memory order indicated in the function name. + * + * @param[in] signal Signal. + * + * @param[in] condition Condition used to compare the signal value with @p + * compare_value. + * + * @param[in] compare_value Value to compare with. + * + * @param[in] timeout_hint Maximum duration of the wait. Specified in the same + * unit as the system timestamp. The operation might block for a shorter or + * longer time even if the condition is not met. A value of UINT64_MAX indicates + * no maximum. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is ultimately decided by + * HSA runtime and may not match the provided hint. A value of + * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal + * update by avoiding rescheduling overhead. + * + * @return Observed value of the signal, which might not satisfy the specified + * condition. + * +*/ +hsa_signal_value_t HSA_API hsa_signal_wait_scacquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_wait_relaxed( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @deprecated Renamed as ::hsa_signal_wait_scacquire. + * + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @brief Group of signals. + */ +typedef struct hsa_signal_group_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_signal_group_t; + +/** + * @brief Create a signal group. + * + * @param[in] num_signals Number of elements in @p signals. Must not be 0. + * + * @param[in] signals List of signals in the group. The list must not contain + * any repeated elements. Must not be NULL. + * + * @param[in] num_consumers Number of elements in @p consumers. Must not be 0. + * + * @param[in] consumers List of agents that might consume (wait on) the signal + * group. The list must not contain repeated elements, and must be a subset of + * the set of agents that are allowed to wait on all the signals in the + * group. If an agent not listed in @p consumers waits on the returned group, + * the behavior is undefined. The memory associated with @p consumers can be + * reused or freed after the function returns. Must not be NULL. + * + * @param[out] signal_group Pointer to newly created signal group. Must not be + * NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals + * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is + * NULL. + */ +hsa_status_t HSA_API hsa_signal_group_create( + uint32_t num_signals, + const hsa_signal_t *signals, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_group_t *signal_group); + +/** + * @brief Destroy a signal group previous created by ::hsa_signal_group_create. + * + * @param[in] signal_group Signal group. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + */ +hsa_status_t HSA_API hsa_signal_group_destroy( + hsa_signal_group_t signal_group); + +/** + * @brief Wait until the value of at least one of the signals in a signal group + * satisfies its associated condition. + * + * @details The function is guaranteed to return if the value of at least one of + * the signals in the group satisfies its associated condition at some point in + * time during the wait, but the signal value returned to the application may no + * longer satisfy the condition. The application must ensure that signals in the + * group are used in such way that wait wakeup conditions are not invalidated + * before dependent threads have woken up. + * + * When this operation internally loads the value of the passed signal, it uses + * the memory order indicated in the function name. + * + * @param[in] signal_group Signal group. + * + * @param[in] conditions List of conditions. Each condition, and the value at + * the same index in @p compare_values, is used to compare the value of the + * signal at that index in @p signal_group (the signal passed by the application + * to ::hsa_signal_group_create at that particular index). The size of @p + * conditions must not be smaller than the number of signals in @p signal_group; + * any extra elements are ignored. Must not be NULL. + * + * @param[in] compare_values List of comparison values. The size of @p + * compare_values must not be smaller than the number of signals in @p + * signal_group; any extra elements are ignored. Must not be NULL. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is decided by the HSA runtime + * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may + * improve the latency of response to a signal update by avoiding rescheduling + * overhead. + * + * @param[out] signal Signal in the group that satisfied the associated + * condition. If several signals satisfied their condition, the function can + * return any of those signals. Must not be NULL. + * + * @param[out] value Observed value for @p signal, which might no longer satisfy + * the specified condition. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p + * compare_values is NULL, @p signal is NULL, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** + * @copydoc hsa_signal_group_wait_any_scacquire + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** @} */ + +/** \defgroup memory Memory + * @{ + */ + +/** + * @brief A memory region represents a block of virtual memory with certain + * properties. For example, the HSA runtime represents fine-grained memory in + * the global segment using a region. A region might be associated with more + * than one agent. + */ +typedef struct hsa_region_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_region_t; + +/** @} */ + + +/** \defgroup queue Queues + * @{ + */ + +/** + * @brief Queue type. Intended to be used for dynamic queue protocol + * determination. + */ +typedef enum { + /** + * Queue supports multiple producers. Use of multiproducer queue mechanics is + * required. + */ + HSA_QUEUE_TYPE_MULTI = 0, + /** + * Queue only supports a single producer. In some scenarios, the application + * may want to limit the submission of AQL packets to a single agent. Queues + * that support a single producer may be more efficient than queues supporting + * multiple producers. Use of multiproducer queue mechanics is not supported. + */ + HSA_QUEUE_TYPE_SINGLE = 1, + /** + * Queue supports multiple producers and cooperative dispatches. Cooperative + * dispatches are able to use GWS synchronization. Queues of this type may be + * limited in number. The runtime may return the same queue to serve multiple + * ::hsa_queue_create calls when this type is given. Callers must inspect the + * returned queue to discover queue size. Queues of this type are reference + * counted and require a matching number of ::hsa_queue_destroy calls to + * release. Use of multiproducer queue mechanics is required. See + * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this + * type. + */ + HSA_QUEUE_TYPE_COOPERATIVE = 2 +} hsa_queue_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_queue_type_t constants. + */ +typedef uint32_t hsa_queue_type32_t; + +/** + * @brief Queue features. + */ +typedef enum { + /** + * Queue supports kernel dispatch packets. + */ + HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, + + /** + * Queue supports agent dispatch packets. + */ + HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 +} hsa_queue_feature_t; + +/** + * @brief User mode queue. + * + * @details The queue structure is read-only and allocated by the HSA runtime, + * but agents can directly modify the contents of the buffer pointed by @a + * base_address, or use HSA runtime APIs to access the doorbell signal. + * + */ +typedef struct hsa_queue_s { + /** + * Queue type. + */ + hsa_queue_type32_t type; + + /** + * Queue features mask. This is a bit-field of ::hsa_queue_feature_t + * values. Applications should ignore any unknown set bits. + */ + uint32_t features; + +#ifdef HSA_LARGE_MODEL + void* base_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Starting address of the HSA runtime-allocated buffer used to store the AQL + * packets. Must be aligned to the size of an AQL packet. + */ + void* base_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; +#else + uint32_t reserved0; + void* base_address; +#endif + + /** + * Signal object used by the application to indicate the ID of a packet that + * is ready to be processed. The HSA runtime manages the doorbell signal. If + * the application tries to replace or destroy this signal, the behavior is + * undefined. + * + * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be + * updated in a monotonically increasing fashion. If @a type is + * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any + * value. + */ + hsa_signal_t doorbell_signal; + + /** + * Maximum number of packets the queue can hold. Must be a power of 2. + */ + uint32_t size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Queue identifier, which is unique over the lifetime of the application. + */ + uint64_t id; + +} hsa_queue_t; + +/** + * @brief Create a user mode queue. + * + * @details The HSA runtime creates the queue structure, the underlying packet + * buffer, the completion signal, and the write and read indexes. The initial + * value of the write and read indexes is 0. The type of every packet in the + * buffer is initialized to ::HSA_PACKET_TYPE_INVALID. + * + * The application should only rely on the error code returned to determine if + * the queue is valid. + * + * @param[in] agent Agent where to create the queue. + * + * @param[in] size Number of packets the queue is expected to + * hold. Must be a power of 2 between 1 and the value of + * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly + * created queue is the maximum of @p size and the value of + * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. + * + * @param[in] type Type of the queue, a bitwise OR of hsa_queue_type_t values. + * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, + * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE. + * + * @param[in] callback Callback invoked by the HSA runtime for every + * asynchronous event related to the newly created queue. May be NULL. The HSA + * runtime passes three arguments to the callback: a code identifying the event + * that triggered the invocation, a pointer to the queue where the event + * originated, and the application data. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @param[in] private_segment_size Hint indicating the maximum + * expected private segment usage per work-item, in bytes. There may + * be performance degradation if the application places a kernel + * dispatch packet in the queue and the corresponding private segment + * usage exceeds @p private_segment_size. If the application does not + * want to specify any particular value for this argument, @p + * private_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[in] group_segment_size Hint indicating the maximum expected + * group segment usage per work-group, in bytes. There may be + * performance degradation if the application places a kernel dispatch + * packet in the queue and the corresponding group segment usage + * exceeds @p group_segment_size. If the application does not want to + * specify any particular value for this argument, @p + * group_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not + * support queues of the given type. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, + * @p size is 0, @p type is an invalid queue type, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_queue_create( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue); + +/** + * @brief Create a queue for which the application or a kernel is responsible + * for processing the AQL packets. + * + * @details The application can use this function to create queues where AQL + * packets are not parsed by the packet processor associated with an agent, + * but rather by a unit of execution running on that agent (for example, a + * thread in the host application). + * + * The application is responsible for ensuring that all the producers and + * consumers of the resulting queue can access the provided doorbell signal + * and memory region. The application is also responsible for ensuring that the + * unit of execution processing the queue packets supports the indicated + * features (AQL packet types). + * + * When the queue is created, the HSA runtime allocates the packet buffer using + * @p region, and the write and read indexes. The initial value of the write and + * read indexes is 0, and the type of every packet in the buffer is initialized + * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features, + * and @e doorbell_signal fields in the returned queue match the values passed + * by the application. + * + * @param[in] region Memory region that the HSA runtime should use to allocate + * the AQL packet buffer and any other queue metadata. + * + * @param[in] size Number of packets the queue is expected to hold. Must be a + * power of 2 greater than 0. + * + * @param[in] type Queue type. + * + * @param[in] features Supported queue features. This is a bit-field of + * ::hsa_queue_feature_t values. + * + * @param[in] doorbell_signal Doorbell signal that the HSA runtime must + * associate with the returned queue. The signal handle must not be 0. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. The application should not rely on the value + * returned for this argument but only in the status code to determine if the + * queue is valid. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p + * size is 0, @p type is an invalid queue type, the doorbell signal handle is + * 0, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_soft_queue_create( + hsa_region_t region, + uint32_t size, + hsa_queue_type32_t type, + uint32_t features, + hsa_signal_t doorbell_signal, + hsa_queue_t **queue); + +/** + * @brief Destroy a user mode queue. + * + * @details When a queue is destroyed, the state of the AQL packets that have + * not been yet fully processed (their completion phase has not finished) + * becomes undefined. It is the responsibility of the application to ensure that + * all pending queue operations are finished if their results are required. + * + * The resources allocated by the HSA runtime during queue creation (queue + * structure, ring buffer, doorbell signal) are released. The queue should not + * be accessed after being destroyed. + * + * @param[in] queue Pointer to a queue created using ::hsa_queue_create. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_destroy( + hsa_queue_t *queue); + +/** + * @brief Inactivate a queue. + * + * @details Inactivating the queue aborts any pending executions and prevent any + * new packets from being processed. Any more packets written to the queue once + * it is inactivated will be ignored by the packet processor. + * + * @param[in] queue Pointer to a queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_inactivate( + hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire. + * + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the read index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Read index of the queue pointed by @p queue. + */ +uint64_t HSA_API hsa_queue_load_read_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_read_index_relaxed( + const hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire. + * + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the write index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Write index of the queue pointed by @p queue. + */ +uint64_t HSA_API hsa_queue_load_write_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_write_index_relaxed( + const hsa_queue_t *queue); + +/** + * @brief Atomically set the write index of a queue. + * + * @details It is recommended that the application uses this function to update + * the write index when there is a single agent submitting work to the queue + * (the queue type is ::HSA_QUEUE_TYPE_SINGLE). + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the write index. + * + */ +void HSA_API hsa_queue_store_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_write_index_screlease. + * + * @copydoc hsa_queue_store_write_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_write_index_relaxed + */ +void HSA_API hsa_queue_store_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl. + * + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @brief Atomically set the write index of a queue if the observed value is + * equal to the expected value. The application can inspect the returned value + * to determine if the replacement was done. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] expected Expected value. + * + * @param[in] value Value to assign to the write index if @p expected matches + * the observed write index. Must be greater than @p expected. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire. + * + * @copydoc hsa_queue_cas_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease. + * + * @copydoc hsa_queue_cas_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_screlease( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl. + * + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically increment the write index of a queue by an offset. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to add to the write index. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_add_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire. + * + * @copydoc hsa_queue_add_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_screlease. + * + * @copydoc hsa_queue_add_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically set the read index of a queue. + * + * @details Modifications of the read index are not allowed and result in + * undefined behavior if the queue is associated with an agent for which + * only the corresponding packet processor is permitted to update the read + * index. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the read index. + * + */ +void HSA_API hsa_queue_store_read_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_read_index_screlease. + * + * @copydoc hsa_queue_store_read_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_read_index_relaxed + */ +void HSA_API hsa_queue_store_read_index_screlease( + const hsa_queue_t *queue, + uint64_t value); +/** @} */ + + +/** \defgroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Packet type. + */ +typedef enum { + /** + * Vendor-specific packet. + */ + HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0, + /** + * The packet has been processed in the past, but has not been reassigned to + * the packet processor. A packet processor must not process a packet of this + * type. All queues support this packet type. + */ + HSA_PACKET_TYPE_INVALID = 1, + /** + * Packet used by agents for dispatching jobs to kernel agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). + */ + HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_AND = 3, + /** + * Packet used by agents for dispatching jobs to agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). + */ + HSA_PACKET_TYPE_AGENT_DISPATCH = 4, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_OR = 5 +} hsa_packet_type_t; + +/** + * @brief Scope of the memory fence operation associated with a packet. + */ +typedef enum { + /** + * No scope (no fence is applied). The packet relies on external fences to + * ensure visibility of memory updates. + */ + HSA_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied with agent scope for the global segment. + */ + HSA_FENCE_SCOPE_AGENT = 1, + /** + * The fence is applied across both agent and system scope for the global + * segment. + */ + HSA_FENCE_SCOPE_SYSTEM = 2 +} hsa_fence_scope_t; + +/** + * @brief Sub-fields of the @a header field that is present in any AQL + * packet. The offset (with respect to the address of @a header) of a sub-field + * is identical to its enumeration constant. The width of each sub-field is + * determined by the corresponding value in ::hsa_packet_header_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Packet type. The value of this sub-field must be one of + * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the + * packet layout is vendor-specific. + */ + HSA_PACKET_HEADER_TYPE = 0, + /** + * Barrier bit. If the barrier bit is set, the processing of the current + * packet only launches when all preceding packets (within the same queue) are + * complete. + */ + HSA_PACKET_HEADER_BARRIER = 8, + /** + * Acquire fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied before the packet enters the + * active phase. An acquire fence ensures that any subsequent global segment + * or image loads by any unit of execution that belongs to a dispatch that has + * not yet entered the active phase on any queue of the same kernel agent, + * sees any data previously released at the scopes specified by the acquire + * fence. The value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9, + /** + * Release fence scope, The value of this sub-field determines the scope and + * type of the memory fence operation applied after kernel completion but + * before the packet is completed. A release fence makes any global segment or + * image data that was stored by any unit of execution that belonged to a + * dispatch that has completed the active phase on any queue of the same + * kernel agent visible in all the scopes specified by the release fence. The + * value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11 + } hsa_packet_header_t; + +/** + * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t. + */ + typedef enum { + HSA_PACKET_HEADER_WIDTH_TYPE = 8, + HSA_PACKET_HEADER_WIDTH_BARRIER = 1, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2 + } hsa_packet_header_width_t; + +/** + * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset + * (with respect to the address of @a setup) of a sub-field is identical to its + * enumeration constant. The width of each sub-field is determined by the + * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Number of dimensions of the grid. Valid values are 1, 2, or 3. + * + */ + HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 + } hsa_kernel_dispatch_packet_setup_t; + +/** + * @brief Width (in bits) of the sub-fields in + * ::hsa_kernel_dispatch_packet_setup_t. + */ + typedef enum { + HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 + } hsa_kernel_dispatch_packet_setup_width_t; + +/** + * @brief AQL kernel dispatch packet + */ +typedef struct hsa_kernel_dispatch_packet_s { + union { + struct { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Dispatch setup parameters. Used to configure kernel dispatch parameters + * such as the number of dimensions in the grid. The parameters are described + * by ::hsa_kernel_dispatch_packet_setup_t. + */ + uint16_t setup; + }; + uint32_t full_header; + }; + + /** + * X dimension of work-group, in work-items. Must be greater than 0. + */ + uint16_t workgroup_size_x; + + /** + * Y dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 dimension, the only valid value is 1. + */ + uint16_t workgroup_size_y; + + /** + * Z dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 or 2 dimensions, the only valid value is 1. + */ + uint16_t workgroup_size_z; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * X dimension of grid, in work-items. Must be greater than 0. Must + * not be smaller than @a workgroup_size_x. + */ + uint32_t grid_size_x; + + /** + * Y dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 dimension, the only valid value is 1. Must not be smaller than @a + * workgroup_size_y. + */ + uint32_t grid_size_y; + + /** + * Z dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a + * workgroup_size_z. + */ + uint32_t grid_size_z; + + /** + * Size in bytes of private memory allocation request (per work-item). + */ + uint32_t private_segment_size; + + /** + * Size in bytes of group memory allocation request (per work-group). Must not + * be less than the sum of the group memory used by the kernel (and the + * functions it calls directly or indirectly) and the dynamically allocated + * group segment variables. + */ + uint32_t group_segment_size; + + /** + * Opaque handle to a code object that includes an implementation-defined + * executable code for the kernel. + */ + uint64_t kernel_object; + +#ifdef HSA_LARGE_MODEL + void* kernarg_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Pointer to a buffer containing the kernel arguments. May be NULL. + * + * The buffer must be allocated using ::hsa_memory_allocate, and must not be + * modified once the kernel dispatch packet is enqueued until the dispatch has + * completed execution. + */ + void* kernarg_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* kernarg_address; +#endif + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_kernel_dispatch_packet_t; + +/** + * @brief Agent dispatch packet. + */ +typedef struct hsa_agent_dispatch_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Application-defined function to be performed by the destination agent. + */ + uint16_t type; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + +#ifdef HSA_LARGE_MODEL + void* return_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Address where to store the function return values, if any. + */ + void* return_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* return_address; +#endif + + /** + * Function arguments. + */ + uint64_t arg[4]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_agent_dispatch_packet_t; + +/** + * @brief Barrier-AND packet. + */ +typedef struct hsa_barrier_and_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as satisfied + * dependencies. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_and_packet_t; + +/** + * @brief Barrier-OR packet. + */ +typedef struct hsa_barrier_or_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as dependencies not + * satisfied. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_or_packet_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a region. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_REGION_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_REGION_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_REGION_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_REGION_SEGMENT_GROUP = 3, + /** + * Kernarg segment. Used to store kernel arguments. + */ + HSA_REGION_SEGMENT_KERNARG = 4 +} hsa_region_segment_t; + +/** + * @brief Global region flags. + */ +typedef enum { + /** + * The application can use memory in the region to store kernel arguments, and + * provide the values for the kernarg segment of a kernel dispatch. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set. + */ + HSA_REGION_GLOBAL_FLAG_KERNARG = 1, + /** + * Updates to memory in this region are immediately visible to all the + * agents under the terms of the HSA memory model. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set. + */ + HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Updates to memory in this region can be performed by a single agent at + * a time. If a different agent in the system is allowed to access the + * region, the application must explicitely invoke ::hsa_memory_assign_agent + * in order to transfer ownership to that agent for a particular buffer. + */ + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** + * Updates to memory in this region have extended scope, where the device-scope atomics + * to this memory type act as system-scope with respect to all variables located in + * memory regions of this type. + * Note: On non-compliant systems, the application may still be responsible for performing + * device-specific actions necessary to achieve system-scope coherence. + */ + HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8 +} hsa_region_global_flag_t; + +/** + * @brief Attributes of a memory region. + */ + +#ifdef __cplusplus +typedef enum : int { +#else +typedef enum { +#endif + /** + * Segment where memory in the region can be used. The type of this + * attribute is ::hsa_region_segment_t. + */ + HSA_REGION_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of + * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t + * values. + */ + HSA_REGION_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this region, in bytes. The type of this attribute is size_t. + */ + HSA_REGION_INFO_SIZE = 2, + /** + * Maximum allocation size in this region, in bytes. Must not exceed the value + * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t. + * + * If the region is in the global or readonly segments, this is the maximum + * size that the application can pass to ::hsa_memory_allocate. + * + * If the region is in the group segment, this is the maximum size (per + * work-group) that can be requested for a given kernel dispatch. If the + * region is in the private segment, this is the maximum size (per work-item) + * that can be requested for a specific kernel dispatch, and must be at least + * 256 bytes. + */ + HSA_REGION_INFO_ALLOC_MAX_SIZE = 4, + /** + * Maximum size (per work-group) of private memory that can be requested for a + * specific kernel dispatch. Must be at least 65536 bytes. The type of this + * attribute is uint32_t. The value of this attribute is undefined if the + * region is not in the private segment. + */ + HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8, + /** + * Indicates whether memory in this region can be allocated using + * ::hsa_memory_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for regions in the group and private + * segments. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by ::hsa_memory_allocate in + * this region. The size of a buffer allocated in this region is a multiple of + * the value of this attribute. The value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type + * of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The + * value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be + * a power of 2. The type of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7 +} hsa_region_info_t; + +/** + * @brief Get the current value of an attribute of a region. + * + * @param[in] region A valid region. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to a application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * region attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_region_get_info( + hsa_region_t region, + hsa_region_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory regions associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per region that is + * accessible from the agent. The HSA runtime passes two arguments to the + * callback, the region and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and ::hsa_agent_iterate_regions returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data); + +/** + * @brief Allocate a block of memory in a given region. + * + * @param[in] region Region where to allocate memory from. The region must have + * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE + * in @p region. + * + * @param[out] ptr Pointer to the location where to store the base address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation + * fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p region, or @p size is greater than the value of + * HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + */ +hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region, + size_t size, + void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_memory_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_memory_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_memory_free(void* ptr); + +/** + * @brief Copy a block of memory from the location pointed to by @p src to the + * memory block pointed to by @p dst. + * + * @param[out] dst Buffer where the content is to be copied. If @p dst is in + * coarse-grained memory, the copied data is only visible to the agent currently + * assigned (::hsa_memory_assign_agent) to @p dst. + * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer. If the source buffer is + * in coarse-grained memory then it must be assigned to an agent, from which the + * data will be retrieved. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL. + */ +hsa_status_t HSA_API hsa_memory_copy( + void *dst, + const void *src, + size_t size); + +/** + * @brief Change the ownership of a global, coarse-grained buffer. + * + * @details The contents of a coarse-grained buffer are visible to an agent + * only after ownership has been explicitely transferred to that agent. Once the + * operation completes, the previous owner cannot longer access the data in the + * buffer. + * + * An implementation of the HSA runtime is allowed, but not required, to change + * the physical location of the buffer when ownership is transferred to a + * different agent. In general the application must not assume this + * behavior. The virtual location (address) of the passed buffer is never + * modified. + * + * @param[in] ptr Base address of a global buffer. The pointer must match an + * address previously returned by ::hsa_memory_allocate. The size of the buffer + * affected by the ownership change is identical to the size of that previous + * allocation. If @p ptr points to a fine-grained global buffer, no operation is + * performed and the function returns success. If @p ptr does not point to + * global memory, the behavior is undefined. + * + * @param[in] agent Agent that becomes the owner of the buffer. The + * application is responsible for ensuring that @p agent has access to the + * region that contains the buffer. It is allowed to change ownership to an + * agent that is already the owner of the buffer, with the same or different + * access permissions. + * + * @param[in] access Access permissions requested for the new owner. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is + * not a valid access value. + */ +hsa_status_t HSA_API hsa_memory_assign_agent( + void *ptr, + hsa_agent_t agent, + hsa_access_permission_t access); + +/** + * + * @brief Register a global, fine-grained buffer. + * + * @details Registering a buffer serves as an indication to the HSA runtime that + * the memory might be accessed from a kernel agent other than the + * host. Registration is a performance hint that allows the HSA runtime + * implementation to know which buffers will be accessed by some of the kernel + * agents ahead of time. + * + * Registration is only recommended for buffers in the global segment that have + * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS + * allocator instead. Registering an OS-allocated buffer in the base profile is + * equivalent to a no-op. + * + * Registrations should not overlap. + * + * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is + * passed, no operation is performed. If the buffer has been allocated using + * ::hsa_memory_allocate, or has already been registered, no operation is + * performed. + * + * @param[in] size Requested registration size in bytes. A size of 0 is + * only allowed if @p ptr is NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr + * is not NULL. + */ +hsa_status_t HSA_API hsa_memory_register( + void *ptr, + size_t size); + +/** + * + * @brief Deregister memory previously registered using ::hsa_memory_register. + * + * @details If the memory interval being deregistered does not match a previous + * registration (start and end addresses), the behavior is undefined. + * + * @param[in] ptr A pointer to the base of the buffer to be deregistered. If + * a NULL pointer is passed, no operation is performed. + * + * @param[in] size Size of the buffer to be deregistered. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_memory_deregister( + void *ptr, + size_t size); + +/** @} */ + + +/** \defgroup instruction-set-architecture Instruction Set Architecture. + * @{ + */ + +/** + * @brief Instruction set architecture. + */ +typedef struct hsa_isa_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_isa_t; + +/** + * @brief Retrieve a reference to an instruction set architecture handle out of + * a symbolic name. + * + * @param[in] name Vendor-specific name associated with a a particular + * instruction set architecture. @p name must start with the vendor name and a + * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be + * a NUL-terminated string. + * + * @param[out] isa Memory location where the HSA runtime stores the ISA handle + * corresponding to the given name. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not + * correspond to any instruction set architecture. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_from_name( + const char *name, + hsa_isa_t *isa); + +/** + * @brief Iterate over the instruction sets supported by the given agent, and + * invoke an application-defined callback on every iteration. The iterator is + * deterministic: if an agent supports several instruction set architectures, + * they are traversed in the same order in every invocation of this function. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per instruction set + * architecture. The HSA runtime passes two arguments to the callback: the + * ISA and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that status value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_isas( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_isa_t isa, void *data), + void *data); + +/** + * @brief Instruction set architecture attributes. + */ +typedef enum { + /** + * The length of the ISA name in bytes, not including the NUL terminator. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is character array + * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute. + */ + HSA_ISA_INFO_NAME = 1, + /** + * @deprecated + * + * Number of call conventions supported by the instruction set architecture. + * Must be greater than zero. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2, + /** + * @deprecated + * + * Number of work-items in a wavefront for a given call convention. Must be a + * power of 2 in the range [1,256]. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3, + /** + * @deprecated + * + * Number of wavefronts per compute unit for a given call convention. In + * practice, other factors (for example, the amount of group memory used by a + * work-group) may further limit the number of wavefronts per compute + * unit. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4, + /** + * Machine models supported by the instruction set architecture. The type of + * this attribute is a bool[2]. If the ISA supports the small machine model, + * the element at index ::HSA_MACHINE_MODEL_SMALL is true. If the ISA supports + * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true. + */ + HSA_ISA_INFO_MACHINE_MODELS = 5, + /** + * Profiles supported by the instruction set architecture. The type of this + * attribute is a bool[2]. If the ISA supports the base profile, the element + * at index ::HSA_PROFILE_BASE is true. If the ISA supports the full profile, + * the element at index ::HSA_PROFILE_FULL is true. + */ + HSA_ISA_INFO_PROFILES = 6, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture. The type of this attribute is a bool[3]. The value at a given + * index is true if the corresponding rounding mode in + * ::hsa_default_float_rounding_mode_t is supported. At least one default mode + * has to be supported. + * + * If the default mode is supported, then + * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that + * both the zero and the near roundings modes are supported. + */ + HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture in the Base profile. The type of this attribute is a + * bool[3]. The value at a given index is true if the corresponding rounding + * mode in ::hsa_default_float_rounding_mode_t is supported. The value at + * index HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false. At least one + * of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or + * HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true. + */ + HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8, + /** + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the instruction set architecture. The type of this + * attribute is bool. + */ + HSA_ISA_INFO_FAST_F16_OPERATION = 9, + /** + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0. No maximum can exceed the value of + * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is + * uint16_t[3]. + */ + HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12, + /** + * Maximum total number of work-items in a work-group. The type + * of this attribute is uint32_t. + */ + HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13, + /** + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_ISA_INFO_GRID_MAX_DIM = 14, + /** + * Maximum total number of work-items in a grid. The type of this + * attribute is uint64_t. + */ + HSA_ISA_INFO_GRID_MAX_SIZE = 16, + /** + * Maximum number of fbarriers per work-group. Must be at least 32. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17 +} hsa_isa_info_t; + +/** + * @deprecated The concept of call convention has been deprecated. If the + * application wants to query the value of an attribute for a given instruction + * set architecture, use ::hsa_isa_get_info_alt instead. If the application + * wants to query an attribute that is specific to a given combination of ISA + * and wavefront, use ::hsa_wavefront_get_info. + * + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[in] index Call convention index. Used only for call convention + * attributes, otherwise ignored. Must have a value between 0 (inclusive) and + * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not + * inclusive) in @p isa. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info( + hsa_isa_t isa, + hsa_isa_info_t attribute, + uint32_t index, + void *value); + +/** + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_get_info_alt( + hsa_isa_t isa, + hsa_isa_info_t attribute, + void *value); + +/** + * @brief Retrieve the exception policy support for a given combination of + * instruction set architecture and profile. + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] profile Profile. + * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_exception_policies( + hsa_isa_t isa, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Floating-point types. + */ +typedef enum { + /** + * 16-bit floating-point type. + */ + HSA_FP_TYPE_16 = 1, + /** + * 32-bit floating-point type. + */ + HSA_FP_TYPE_32 = 2, + /** + * 64-bit floating-point type. + */ + HSA_FP_TYPE_64 = 4 +} hsa_fp_type_t; + +/** + * @brief Flush to zero modes. + */ +typedef enum { + /** + * Flush to zero. + */ + HSA_FLUSH_MODE_FTZ = 1, + /** + * Do not flush to zero. + */ + HSA_FLUSH_MODE_NON_FTZ = 2 +} hsa_flush_mode_t; + +/** + * @brief Round methods. + */ +typedef enum { + /** + * Single round method. + */ + HSA_ROUND_METHOD_SINGLE = 1, + /** + * Double round method. + */ + HSA_ROUND_METHOD_DOUBLE = 2 +} hsa_round_method_t; + +/** + * @brief Retrieve the round method (single or double) used to implement the + * floating-point multiply add instruction (mad) for a given combination of + * instruction set architecture, floating-point type, and flush to zero + * modifier. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] fp_type Floating-point type. + * + * @param[in] flush_mode Flush to zero modifier. + * + * @param[out] round_method Pointer to a memory location where the HSA + * runtime stores the round method used by the implementation. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid + * floating-point type, or @p flush_mode is not a valid flush to zero modifier, + * or @p round_method is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_round_method( + hsa_isa_t isa, + hsa_fp_type_t fp_type, + hsa_flush_mode_t flush_mode, + hsa_round_method_t *round_method); + +/** + * @brief Wavefront handle + */ +typedef struct hsa_wavefront_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_wavefront_t; + +/** + * @brief Wavefront attributes. + */ +typedef enum { + /** + * Number of work-items in the wavefront. Must be a power of 2 in the range + * [1,256]. The type of this attribute is uint32_t. + */ + HSA_WAVEFRONT_INFO_SIZE = 0 +} hsa_wavefront_info_t; + +/** + * @brief Get the current value of a wavefront attribute. + * + * @param[in] wavefront A wavefront. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * wavefront attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_wavefront_get_info( + hsa_wavefront_t wavefront, + hsa_wavefront_info_t attribute, + void *value); + +/** + * @brief Iterate over the different wavefronts supported by an instruction set + * architecture, and invoke an application-defined callback on every iteration. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] callback Callback to be invoked once per wavefront that is + * supported by the agent. The HSA runtime passes two arguments to the callback: + * the wavefront handle and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_isa_iterate_wavefronts( + hsa_isa_t isa, + hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data), + void *data); + +/** + * @deprecated Use ::hsa_agent_iterate_isas to query which instructions set + * architectures are supported by a given agent. + * + * @brief Check if the instruction set architecture of a code object can be + * executed on an agent associated with another architecture. + * + * @param[in] code_object_isa Instruction set architecture associated with a + * code object. + * + * @param[in] agent_isa Instruction set architecture associated with an agent. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. If the two architectures are compatible, the result + * is true; if they are incompatible, the result is false. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa are + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible( + hsa_isa_t code_object_isa, + hsa_isa_t agent_isa, + bool *result); + +/** @} */ + + +/** \defgroup executable Executable + * @{ + */ + +/** + * @brief Code object reader handle. A code object reader is used to + * load a code object from file (when created using + * ::hsa_code_object_reader_create_from_file), or from memory (if created using + * ::hsa_code_object_reader_create_from_memory). + */ +typedef struct hsa_code_object_reader_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_object_reader_t; + +/** + * @brief Create a code object reader to operate on a file. + * + * @param[in] file File descriptor. The file must have been opened by + * application with at least read permissions prior calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_file( + hsa_file_t file, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Create a code object reader to operate on memory. + * + * @param[in] code_object Memory buffer that contains a vendor-specific code + * object. The buffer is owned and managed by the application; the lifetime of + * the buffer must exceed that of any associated code object reader. + * + * @param[in] size Size of the buffer pointed to by @p code_object. Must not be + * 0. + * + * @param[out] code_object_reader Memory location to store newly created code + * object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size + * is zero, or @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_memory( + const void *code_object, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Destroy a code object reader. + * + * @details The code object reader handle becomes invalid after completion of + * this function. Any file or memory used to create the code object read is not + * closed, removed, or deallocated by this function. + * + * @param[in] code_object_reader Code object reader to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + */ +hsa_status_t HSA_API hsa_code_object_reader_destroy( + hsa_code_object_reader_t code_object_reader); + +/** + * @brief Struct containing an opaque handle to an executable, which contains + * ISA for finalized kernels and indirect functions together with the allocated + * global or readonly segment variables they reference. + */ +typedef struct hsa_executable_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_t; + +/** + * @brief Executable state. + */ +typedef enum { + /** + * Executable state, which allows the user to load code objects and define + * external variables. Variable addresses, kernel code handles, and + * indirect function code handles are not available in query operations until + * the executable is frozen (zero always returned). + */ + HSA_EXECUTABLE_STATE_UNFROZEN = 0, + /** + * Executable state, which allows the user to query variable addresses, + * kernel code handles, and indirect function code handles using query + * operations. Loading new code objects, as well as defining external + * variables, is not allowed in this state. + */ + HSA_EXECUTABLE_STATE_FROZEN = 1 +} hsa_executable_state_t; + +/** + * @deprecated Use ::hsa_executable_create_alt instead, which allows the + * application to specify the default floating-point rounding mode of the + * executable and assumes an unfrozen initial state. + * + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] executable_state Executable state. If the state is + * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no + * code objects can be loaded, and no variables can be defined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores the newly + * created executable handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create( + hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] default_float_rounding_mode Default floating-point rounding mode + * used in the executable. Allowed rounding modes are near and zero (default is + * not allowed). + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores newly + * created executable handle. The initial state of the executable is + * ::HSA_EXECUTABLE_STATE_UNFROZEN. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API hsa_executable_create_alt( + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Destroy an executable. + * + * @details An executable handle becomes invalid after the executable has been + * destroyed. Code object handles that were loaded into this executable are + * still valid after the executable has been destroyed, and can be used as + * intended. Resources allocated outside and associated with this executable + * (such as external global or readonly variables) can be released after the + * executable has been destroyed. + * + * Executable should not be destroyed while kernels are in flight. + * + * @param[in] executable Executable. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + */ +hsa_status_t HSA_API hsa_executable_destroy( + hsa_executable_t executable); + +/** + * @brief Loaded code object handle. + */ +typedef struct hsa_loaded_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_loaded_code_object_t; + +/** + * @brief Load a program code object into an executable. + * + * @details A program code object contains information about resources that are + * accessible by all kernel agents that run the executable, and can be loaded + * at most once into an executable. + * + * If the program code object uses extensions, the implementation must support + * them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] code_object_reader A code object reader that holds the program + * code object to load. If a code object reader is destroyed before all the + * associated executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is + * not compatible with the executable or the implementation (for example, the + * code object uses an extension that is not supported by the implementation). + */ +hsa_status_t HSA_API hsa_executable_load_program_code_object( + hsa_executable_t executable, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Load an agent code object into an executable. + * + * @details The agent code object contains all defined agent + * allocation variables, functions, indirect functions, and kernels in a given + * program for a given instruction set architecture. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * The default floating-point rounding mode of the code object associated with + * @p code_object_reader must match that of the executable + * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which + * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used). + * If the agent code object uses extensions, the implementation and the agent + * must support them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent to load code object for. A code object can be loaded + * into an executable at most once for a given agent. The instruction set + * architecture of the code object must be supported by the agent. + * + * @param[in] code_object_reader A code object reader that holds the code object + * to load. If a code object reader is destroyed before all the associated + * executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p + * code_object_reader is not compatible with the agent (for example, the agent + * does not support the instruction set architecture of the code object), the + * executable (for example, there is a default floating-point mode mismatch + * between the two), or the implementation. + */ +hsa_status_t HSA_API hsa_executable_load_agent_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Freeze the executable. + * + * @details No modifications to executable can be made after freezing: no code + * objects can be loaded to the executable, and no external variables can be + * defined. Freezing the executable does not prevent querying the executable's + * attributes. The application must define all the external variables in an + * executable before freezing it. + * + * @param[in] executable Executable. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are + * undefined in the executable. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen. + */ +hsa_status_t HSA_API hsa_executable_freeze( + hsa_executable_t executable, + const char *options); + +/** + * @brief Executable attributes. + */ +typedef enum { + /** + * Profile this executable is created for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_EXECUTABLE_INFO_PROFILE = 1, + /** + * Executable state. The type of this attribute is ::hsa_executable_state_t. + */ + HSA_EXECUTABLE_INFO_STATE = 2, + /** + * Default floating-point rounding mode specified when executable was created. + * The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3 +} hsa_executable_info_t; + +/** + * @brief Get the current value of an attribute for a given executable. + * + * @param[in] executable Executable. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_info( + hsa_executable_t executable, + hsa_executable_info_t attribute, + void *value); + +/** + * @brief Define an external global variable with program allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with program allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * be in global memory and can be read and written by any agent in the + * system. The application cannot deallocate the buffer pointed by @p address + * before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_global_variable_define( + hsa_executable_t executable, + const char *variable_name, + void *address); + +/** + * @brief Define an external global variable with agent allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with agent allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a global region + * that is only visible to @p agent. The application cannot deallocate the + * buffer pointed by @p address before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_agent_global_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Define an external readonly variable. + * + * @details This function allows the application to provide the definition + * of a variable in the readonly segment memory. The variable must be defined + * before loading a code object into an executable. In addition, code objects + * loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a readonly + * region associated with @p agent. The application cannot deallocate the buffer + * pointed by @p address before @p executable is destroyed. + * + * @param[in] address Address where the variable is defined. The buffer pointed + * by @p address is owned by the application, and cannot be deallocated before + * @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE Executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_readonly_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). Invoking this function is equivalent to invoking + * ::hsa_executable_validate_alt with no options. + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate( + hsa_executable_t executable, + uint32_t *result); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate_alt( + hsa_executable_t executable, + const char *options, + uint32_t *result); + +/** + * @brief Executable symbol handle. + * + * The lifetime of an executable object symbol matches that of the executable + * associated with it. An operation on a symbol whose associated executable has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_executable_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_symbol_t; + +/** + * @deprecated Use ::hsa_executable_get_symbol_by_name instead. + * + * @brief Get the symbol handle for a given a symbol name. + * + * @param[in] executable Executable. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[in] agent Agent associated with the symbol. If the symbol is + * independent of any agent (for example, a variable with program + * allocation), this argument is ignored. + * + * @param[in] call_convention Call convention associated with the symbol. If the + * symbol does not correspond to an indirect function, this argument is ignored. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol( + hsa_executable_t executable, + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention, + hsa_executable_symbol_t *symbol); + +/** + * @brief Retrieve the symbol handle corresponding to a given a symbol name. + * + * @param[in] executable Executable. + * + * @param[in] symbol_name Symbol name. Must be a NUL-terminated character + * array. The Programmer's Reference Manual describes the standard name mangling + * scheme. + * + * @param[in] agent Pointer to the agent for which the symbol with the given + * name is defined. If the symbol corresponding to the given name has program + * allocation, @p agent must be NULL. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p + * symbol is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_symbol_by_name( + hsa_executable_t executable, + const char *symbol_name, + const hsa_agent_t *agent, + hsa_executable_symbol_t *symbol); + +/** + * @brief Symbol type. + */ +typedef enum { + /** + * Variable. + */ + HSA_SYMBOL_KIND_VARIABLE = 0, + /** + * Kernel. + */ + HSA_SYMBOL_KIND_KERNEL = 1, + /** + * Indirect function. + */ + HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2 +} hsa_symbol_kind_t; + +/** + * @brief Linkage type of a symbol. + */ +typedef enum { + /** + * Module linkage. + */ + HSA_SYMBOL_LINKAGE_MODULE = 0, + /** + * Program linkage. + */ + HSA_SYMBOL_LINKAGE_PROGRAM = 1 +} hsa_symbol_linkage_t; + +/** + * @brief Allocation type of a variable. + */ +typedef enum { + /** + * Agent allocation. + */ + HSA_VARIABLE_ALLOCATION_AGENT = 0, + /** + * Program allocation. + */ + HSA_VARIABLE_ALLOCATION_PROGRAM = 1 +} hsa_variable_allocation_t; + +/** + * @brief Memory segment associated with a variable. + */ +typedef enum { + /** + * Global memory segment. + */ + HSA_VARIABLE_SEGMENT_GLOBAL = 0, + /** + * Readonly memory segment. + */ + HSA_VARIABLE_SEGMENT_READONLY = 1 +} hsa_variable_segment_t; + +/** + * @brief Executable symbol attributes. + */ +typedef enum { + /** + * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, + /** + * @deprecated + * + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * @deprecated + * + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. The type of this attribute + * is character array with the length equal to the value of + * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * @deprecated + * + * Agent associated with this symbol. If the symbol is a variable, the + * value of this attribute is only defined if + * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is + * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20, + /** + * The address of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint64_t. + * + * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is + * returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * @deprecated + * + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * @deprecated + * + * The segment kind of the variable. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * @deprecated + * + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * @deprecated + * + * Size of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint32_t. + * + * A value of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * @deprecated + * + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Kernel object handle, used in the kernel dispatch packet. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint64_t. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is + * true, the kernel may use more private memory than the reported value, and + * the application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * @deprecated + * + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Indirect function object handle. The value of this attribute is undefined + * if the symbol is not an indirect function, or the associated agent does + * not support the Full Profile. The type of this attribute depends on the + * machine model: the type is uint32_t for small machine model, and uint64_t + * for large model. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23, + /** + * @deprecated + * + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function, or the associated + * agent does not support the Full Profile. The type of this attribute is + * uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_executable_symbol_info_t; + +/** + * @brief Get the current value of an attribute for a given executable symbol. + * + * @param[in] executable_symbol Executable symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_symbol_get_info( + hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in a executable, and invoke an + * application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the kernels, indirect functions, and agent allocation + * variables in an executable for a given agent, and invoke an application- + * defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_executable_iterate_agent_symbols( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the program allocation variables in an executable, and + * invoke an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_executable_iterate_program_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** @} */ + + +/** \defgroup code-object Code Objects (deprecated). + * @{ + */ + +/** + * @deprecated + * + * @brief Struct containing an opaque handle to a code object, which contains + * ISA for finalized kernels and indirect functions together with information + * about the global or readonly segment variables they reference. + */ +typedef struct hsa_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_object_t; + +/** + * @deprecated + * + * @brief Application data handle that is passed to the serialization + * and deserialization functions. + */ +typedef struct hsa_callback_data_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_callback_data_t; + +/** + * @deprecated + * + * @brief Serialize a code object. Can be used for offline finalization, + * install-time finalization, disk code caching, etc. + * + * @param[in] code_object Code object. + * + * @param[in] alloc_callback Callback function for memory allocation. Must not + * be NULL. The HSA runtime passes three arguments to the callback: the + * allocation size, the application data, and a pointer to a memory location + * where the application stores the allocation result. The HSA runtime invokes + * @p alloc_callback once to allocate a buffer that contains the serialized + * version of @p code_object. If the callback returns a status code other than + * ::HSA_STATUS_SUCCESS, this function returns the same code. + * + * @param[in] callback_data Application data that is passed to @p + * alloc_callback. May be NULL. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] serialized_code_object Memory location where the HSA runtime + * stores a pointer to the serialized code object. Must not be NULL. + * + * @param[out] serialized_code_object_size Memory location where the HSA runtime + * stores the size (in bytes) of @p serialized_code_object. The returned value + * matches the allocation size passed by the HSA runtime to @p + * alloc_callback. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p + * serialized_code_object, or @p serialized_code_object_size are NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)(size_t size, + hsa_callback_data_t data, + void **address), + hsa_callback_data_t callback_data, + const char *options, + void **serialized_code_object, + size_t *serialized_code_object_size); + +/** + * @deprecated + * + * @brief Deserialize a code object. + * + * @param[in] serialized_code_object A serialized code object. Must not be NULL. + * + * @param[in] serialized_code_object_size The size (in bytes) of @p + * serialized_code_object. Must not be 0. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] code_object Memory location where the HSA runtime stores the + * deserialized code object. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p + * code_object are NULL, or @p serialized_code_object_size is 0. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize( + void *serialized_code_object, + size_t serialized_code_object_size, + const char *options, + hsa_code_object_t *code_object); + +/** + * @deprecated + * + * @brief Destroy a code object. + * + * @details The lifetime of a code object must exceed that of any executable + * where it has been loaded. If an executable that loaded @p code_object has not + * been destroyed, the behavior is undefined. + * + * @param[in] code_object Code object. The handle becomes invalid after it has + * been destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy( + hsa_code_object_t code_object); + +/** + * @deprecated + * + * @brief Code object type. + */ +typedef enum { + /** + * Produces code object that contains ISA for all kernels and indirect + * functions in HSA source. + */ + HSA_CODE_OBJECT_TYPE_PROGRAM = 0 +} hsa_code_object_type_t; + +/** + * @deprecated + * + * @brief Code object attributes. + */ +typedef enum { + /** + * The version of the code object. The type of this attribute is a + * NUL-terminated char[64]. The name must be at most 63 characters long (not + * including the NUL terminator) and all array elements not used for the name + * must be NUL. + */ + HSA_CODE_OBJECT_INFO_VERSION = 0, + /** + * Type of code object. The type of this attribute is + * ::hsa_code_object_type_t. + */ + HSA_CODE_OBJECT_INFO_TYPE = 1, + /** + * Instruction set architecture this code object is produced for. The type of + * this attribute is ::hsa_isa_t. + */ + HSA_CODE_OBJECT_INFO_ISA = 2, + /** + * Machine model this code object is produced for. The type of this attribute + * is ::hsa_machine_model_t. + */ + HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3, + /** + * Profile this code object is produced for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_CODE_OBJECT_INFO_PROFILE = 4, + /** + * Default floating-point rounding mode used when the code object is + * produced. The type of this attribute is + * ::hsa_default_float_rounding_mode_t. + */ + HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5 +} hsa_code_object_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code object. + * + * @param[in] code_object Code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code object attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info( + hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Load code object into the executable. + * + * @details Every global or readonly variable that is external must be defined + * before loading the code object. An internal global or readonly variable is + * allocated once the code object, that is being loaded, references this + * variable and this variable is not allocated. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent to load code object for. The agent must support the + * default floating-point rounding mode used by @p code_object. + * + * @param[in] code_object Code object to load. The lifetime of the code object + * must exceed that of the executable: if @p code_object is destroyed before @p + * executable, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible + * with @p code_object (for example, @p agent does not support the default + * floating-point rounding mode specified by @p code_object), or @p code_object + * is not compatible with @p executable (for example, @p code_object and @p + * executable have different machine models or profiles). + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options); + +/** + * @deprecated + * + * @brief Code object symbol handle. + * + * The lifetime of a code object symbol matches that of the code object + * associated with it. An operation on a symbol whose associated code object has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_code_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_symbol_t; + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given a symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol( + hsa_code_object_t code_object, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given a symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name( + hsa_code_object_t code_object, + const char *module_name, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Code object symbol attributes. + */ +typedef enum { + /** + * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_CODE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_CODE_SYMBOL_INFO_NAME = 2, + /** + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. The type of this attribute + * is character array with the length equal to the value of + * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_CODE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * The segment kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * Size of the variable. The value of this attribute is undefined if the + * symbol is not a variable. The type of this attribute is uint32_t. + * + * A size of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true, + * the kernel may use more private memory than the reported value, and the + * application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function. The type of this + * attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16, + /** + * Wavefront size used by the kernel. The value of this attribute is either + * 32 or 64. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 +} hsa_code_symbol_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code symbol. + * + * @param[in] code_symbol Code symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info( + hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in a code object, and invoke an + * application-defined callback on every iteration. + * + * @param[in] code_object Code object. + * + * @param[in] callback Callback to be invoked once per code object symbol. The + * HSA runtime passes three arguments to the callback: the code object, a + * symbol, and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_code_object_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)(hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void *data), + void *data); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h new file mode 100644 index 0000000000..22847a8a44 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h @@ -0,0 +1,97 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HSA_RUNTIME_AMD_TOOL_EVENTS_H_ +#define HSA_RUNTIME_AMD_TOOL_EVENTS_H_ + +// Insert license header + +#include +#include +#include "hsa.h" + + +typedef enum { + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE = 0, + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE = + (1 << 0), // This scratch allocation is only valid for 1 dispatch. + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT = + (1 << 1), // Used alternate scratch instead of main scratch +} hsa_amd_event_scratch_alloc_flag_t; + +typedef enum { + HSA_AMD_TOOL_EVENT_MIN = 0, + + // Scratch memory tracking + HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START, + HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END, + HSA_AMD_TOOL_EVENT_SCRATCH_FREE_START, + HSA_AMD_TOOL_EVENT_SCRATCH_FREE_END, + HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_START, + HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_END, + + // Add new events above ^ + HSA_AMD_TOOL_EVENT_MAX +} hsa_amd_tool_event_kind_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; +} hsa_amd_tool_event_none_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; + uint64_t dispatch_id; // Dispatch ID of the AQL packet that needs more scratch memory +} hsa_amd_event_scratch_alloc_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; + uint64_t dispatch_id; // Dispatch ID of the AQL packet that needs more scratch memory + size_t size; // Amount of scratch allocated - in bytes + size_t num_slots; // limit of number of waves +} hsa_amd_event_scratch_alloc_end_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_free_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_free_end_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_async_reclaim_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_async_reclaim_end_t; + +typedef union { + const hsa_amd_tool_event_none_t* none; + const hsa_amd_event_scratch_alloc_start_t* scratch_alloc_start; + const hsa_amd_event_scratch_alloc_end_t* scratch_alloc_end; + const hsa_amd_event_scratch_free_start_t* scratch_free_start; + const hsa_amd_event_scratch_free_end_t* scratch_free_end; + const hsa_amd_event_scratch_async_reclaim_start_t* scratch_async_reclaim_start; + const hsa_amd_event_scratch_async_reclaim_end_t* scratch_async_reclaim_end; +} hsa_amd_tool_event_t; + +typedef hsa_status_t (*hsa_amd_tool_event)(hsa_amd_tool_event_t); + + +#endif \ No newline at end of file diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h new file mode 100644 index 0000000000..cc33320269 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h @@ -0,0 +1,587 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_H + +#include "hsa.h" +#include "hsa_api_trace_version.h" +#ifdef AMD_INTERNAL_BUILD +#include "hsa_ext_image.h" +#include "hsa_ext_amd.h" +#include "hsa_ext_finalize.h" +#include "hsa_amd_tool.h" +#include "hsa_ven_amd_pc_sampling.h" +#else +#include "inc/hsa_ext_image.h" +#include "inc/hsa_ext_amd.h" +#include "inc/hsa_ext_finalize.h" +#include "inc/hsa_amd_tool.h" +#include "inc/hsa_ven_amd_pc_sampling.h" +#endif + +#include +#include +#include + +// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h + +// Min function used to copy Api Tables +static inline uint32_t Min(const uint32_t a, const uint32_t b) { + return (a > b) ? b : a; +} + +// Declarations of APIs intended for use only by tools. + +// An AQL packet that can be put in an intercept queue to cause a callback to +// be invoked when the packet is about to be submitted to the underlying +// hardware queue. These packets are not copied to the underlying hardware +// queue. These packets should come immediately before the regular AQL packet +// they relate to. This implies that packet rewriters should always keep these +// packets adjacent to the regular AQL packet that follows them. +const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE; + +struct amd_aql_intercept_marker_s; + +// When an intercept queue is processing rewritten packets to put them on the +// underlying hardware queue, if it encounters a +// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following +// handler. packet points to the packet, queue is the underlying hardware +// queue, and packet_id is the packet id of the next packet to be put on the +// underlying hardware queue. The intercept queue does not put these packets +// onto the underlying hardware queue. +typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet, + hsa_queue_t* queue, uint64_t packet_id); +// An AQL vendor packet used by the intercept queue to mark the following +// packet. The callback will be invoked to allow a tool to know where in the +// underlying hardware queue the following packet will be placed. user_data can +// be used to hold any data useful to the tool. +typedef struct amd_aql_intercept_marker_s { + uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC. + uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER. + uint8_t reserved[5]; // Must be 0. +#ifdef HSA_LARGE_MODEL + amd_intercept_marker_handler callback; +#elif defined HSA_LITTLE_ENDIAN + amd_intercept_marker_handler callback; + uint32_t reserved1; // Must be 0. +#else + uint32_t reserved1; // Must be 0. + amd_intercept_marker_handler callback; +#endif + uint64_t user_data[6]; +} amd_aql_intercept_marker_t; + +typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count); +typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count, + uint64_t user_pkt_index, void* data, + hsa_amd_queue_intercept_packet_writer writer); +hsa_status_t hsa_amd_queue_intercept_register(hsa_queue_t* queue, + hsa_amd_queue_intercept_handler callback, + void* user_data); +hsa_status_t hsa_amd_queue_intercept_create( + hsa_agent_t agent_handle, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, + uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue); + +typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t* queue, hsa_agent_t agent, + void* data); +hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifier callback, + void* user_data); + +// Structure of Version used to identify an instance of Api table +// Must be the first member (offsetof == 0) of all API tables. +// This is the root of the table passing ABI. +struct ApiTableVersion { + uint32_t major_id; + uint32_t minor_id; + uint32_t step_id; + uint32_t reserved; +}; + +struct ToolsApiTable { + ApiTableVersion version; + + hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_end_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_free_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_free_end_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_end_fn; +}; + +// Table to export HSA Finalizer Extension Apis +struct FinalizerExtTable { + ApiTableVersion version; + decltype(hsa_ext_program_create)* hsa_ext_program_create_fn; + decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn; + decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn; + decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn; + decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn; + decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn; +}; + +// Table to export HSA Image Extension Apis +struct ImageExtTable { + ApiTableVersion version; + decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn; + decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn; + decltype(hsa_ext_image_create)* hsa_ext_image_create_fn; + decltype(hsa_ext_image_import)* hsa_ext_image_import_fn; + decltype(hsa_ext_image_export)* hsa_ext_image_export_fn; + decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn; + decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn; + decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn; + decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn; + decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn; + decltype(hsa_ext_image_get_capability_with_layout)* hsa_ext_image_get_capability_with_layout_fn; + decltype(hsa_ext_image_data_get_info_with_layout)* hsa_ext_image_data_get_info_with_layout_fn; + decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn; + decltype(hsa_ext_sampler_create_v2)* hsa_ext_sampler_create_v2_fn; + +}; + +// Table to export HSA PC Sampling Extension Apis +struct PcSamplingExtTable { + ApiTableVersion version; + decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn; + decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn; + decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn; + decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn; + decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn; + decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn; + decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn; +}; + + +// Table to export AMD Extension Apis +struct AmdExtTable { + ApiTableVersion version; + decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn; + decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn; + decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn; + decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn; + decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn; + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn; + decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; + decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; + decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; + decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn; + decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; + decltype(hsa_amd_memory_async_copy_on_engine)* hsa_amd_memory_async_copy_on_engine_fn; + decltype(hsa_amd_memory_copy_engine_status)* hsa_amd_memory_copy_engine_status_fn; + decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; + decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn; + decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn; + decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn; + decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn; + decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn; + decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn; + decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn; + decltype(hsa_amd_image_create)* hsa_amd_image_create_fn; + decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn; + decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn; + decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn; + decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn; + decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn; + decltype(hsa_amd_signal_create)* hsa_amd_signal_create_fn; + decltype(hsa_amd_ipc_signal_create)* hsa_amd_ipc_signal_create_fn; + decltype(hsa_amd_ipc_signal_attach)* hsa_amd_ipc_signal_attach_fn; + decltype(hsa_amd_register_system_event_handler)* hsa_amd_register_system_event_handler_fn; + decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; + decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn; + decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; + decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn; + decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn; + decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; + decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; + decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; + decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; + decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; + decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; + decltype(hsa_amd_spm_acquire)* hsa_amd_spm_acquire_fn; + decltype(hsa_amd_spm_release)* hsa_amd_spm_release_fn; + decltype(hsa_amd_spm_set_dest_buffer)* hsa_amd_spm_set_dest_buffer_fn; + decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn; + decltype(hsa_amd_portable_export_dmabuf)* hsa_amd_portable_export_dmabuf_fn; + decltype(hsa_amd_portable_close_dmabuf)* hsa_amd_portable_close_dmabuf_fn; + decltype(hsa_amd_vmem_address_reserve)* hsa_amd_vmem_address_reserve_fn; + decltype(hsa_amd_vmem_address_free)* hsa_amd_vmem_address_free_fn; + decltype(hsa_amd_vmem_handle_create)* hsa_amd_vmem_handle_create_fn; + decltype(hsa_amd_vmem_handle_release)* hsa_amd_vmem_handle_release_fn; + decltype(hsa_amd_vmem_map)* hsa_amd_vmem_map_fn; + decltype(hsa_amd_vmem_unmap)* hsa_amd_vmem_unmap_fn; + decltype(hsa_amd_vmem_set_access)* hsa_amd_vmem_set_access_fn; + decltype(hsa_amd_vmem_get_access)* hsa_amd_vmem_get_access_fn; + decltype(hsa_amd_vmem_export_shareable_handle)* hsa_amd_vmem_export_shareable_handle_fn; + decltype(hsa_amd_vmem_import_shareable_handle)* hsa_amd_vmem_import_shareable_handle_fn; + decltype(hsa_amd_vmem_retain_alloc_handle)* hsa_amd_vmem_retain_alloc_handle_fn; + decltype(hsa_amd_vmem_get_alloc_properties_from_handle)* + hsa_amd_vmem_get_alloc_properties_from_handle_fn; + decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn; + decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn; + decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn; + decltype(hsa_amd_enable_logging)* hsa_amd_enable_logging_fn; + decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn; + decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn; + decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn; + decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn; + decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn; +}; + +// Table to export HSA Core Runtime Apis +struct CoreApiTable { + ApiTableVersion version; + decltype(hsa_init)* hsa_init_fn; + decltype(hsa_shut_down)* hsa_shut_down_fn; + decltype(hsa_system_get_info)* hsa_system_get_info_fn; + decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn; + decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn; + decltype(hsa_iterate_agents)* hsa_iterate_agents_fn; + decltype(hsa_agent_get_info)* hsa_agent_get_info_fn; + decltype(hsa_queue_create)* hsa_queue_create_fn; + decltype(hsa_soft_queue_create)* hsa_soft_queue_create_fn; + decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn; + decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; + decltype(hsa_queue_cas_write_index_scacq_screl)* hsa_queue_cas_write_index_scacq_screl_fn; + decltype(hsa_queue_cas_write_index_scacquire)* hsa_queue_cas_write_index_scacquire_fn; + decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn; + decltype(hsa_queue_cas_write_index_screlease)* hsa_queue_cas_write_index_screlease_fn; + decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn; + decltype(hsa_queue_add_write_index_scacquire)* hsa_queue_add_write_index_scacquire_fn; + decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn; + decltype(hsa_queue_add_write_index_screlease)* hsa_queue_add_write_index_screlease_fn; + decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn; + decltype(hsa_queue_store_read_index_screlease)* hsa_queue_store_read_index_screlease_fn; + decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn; + decltype(hsa_region_get_info)* hsa_region_get_info_fn; + decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn; + decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn; + decltype(hsa_memory_register)* hsa_memory_register_fn; + decltype(hsa_memory_deregister)* hsa_memory_deregister_fn; + decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; + decltype(hsa_memory_free)* hsa_memory_free_fn; + decltype(hsa_memory_copy)* hsa_memory_copy_fn; + decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; + decltype(hsa_signal_create)* hsa_signal_create_fn; + decltype(hsa_signal_destroy)* hsa_signal_destroy_fn; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn; + decltype(hsa_signal_load_scacquire)* hsa_signal_load_scacquire_fn; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn; + decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire_fn; + decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn; + decltype(hsa_signal_and_scacquire)* hsa_signal_and_scacquire_fn; + decltype(hsa_signal_and_screlease)* hsa_signal_and_screlease_fn; + decltype(hsa_signal_and_scacq_screl)* hsa_signal_and_scacq_screl_fn; + decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn; + decltype(hsa_signal_or_scacquire)* hsa_signal_or_scacquire_fn; + decltype(hsa_signal_or_screlease)* hsa_signal_or_screlease_fn; + decltype(hsa_signal_or_scacq_screl)* hsa_signal_or_scacq_screl_fn; + decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn; + decltype(hsa_signal_xor_scacquire)* hsa_signal_xor_scacquire_fn; + decltype(hsa_signal_xor_screlease)* hsa_signal_xor_screlease_fn; + decltype(hsa_signal_xor_scacq_screl)* hsa_signal_xor_scacq_screl_fn; + decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn; + decltype(hsa_signal_exchange_scacquire)* hsa_signal_exchange_scacquire_fn; + decltype(hsa_signal_exchange_screlease)* hsa_signal_exchange_screlease_fn; + decltype(hsa_signal_exchange_scacq_screl)* hsa_signal_exchange_scacq_screl_fn; + decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn; + decltype(hsa_signal_add_scacquire)* hsa_signal_add_scacquire_fn; + decltype(hsa_signal_add_screlease)* hsa_signal_add_screlease_fn; + decltype(hsa_signal_add_scacq_screl)* hsa_signal_add_scacq_screl_fn; + decltype(hsa_signal_subtract_relaxed)* hsa_signal_subtract_relaxed_fn; + decltype(hsa_signal_subtract_scacquire)* hsa_signal_subtract_scacquire_fn; + decltype(hsa_signal_subtract_screlease)* hsa_signal_subtract_screlease_fn; + decltype(hsa_signal_subtract_scacq_screl)* hsa_signal_subtract_scacq_screl_fn; + decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn; + decltype(hsa_signal_cas_scacquire)* hsa_signal_cas_scacquire_fn; + decltype(hsa_signal_cas_screlease)* hsa_signal_cas_screlease_fn; + decltype(hsa_signal_cas_scacq_screl)* hsa_signal_cas_scacq_screl_fn; + + //===--- Instruction Set Architecture -----------------------------------===// + + decltype(hsa_isa_from_name)* hsa_isa_from_name_fn; + // Deprecated since v1.1. + decltype(hsa_isa_get_info)* hsa_isa_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_isa_compatible)* hsa_isa_compatible_fn; + + //===--- Code Objects (deprecated) --------------------------------------===// + + // Deprecated since v1.1. + decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn; + // Deprecated since v1.1. + decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn; + + //===--- Executable -----------------------------------------------------===// + + // Deprecated since v1.1. + decltype(hsa_executable_create)* hsa_executable_create_fn; + decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; + // Deprecated since v1.1. + decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn; + decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; + decltype(hsa_executable_get_info)* hsa_executable_get_info_fn; + decltype(hsa_executable_global_variable_define)* + hsa_executable_global_variable_define_fn; + decltype(hsa_executable_agent_global_variable_define)* + hsa_executable_agent_global_variable_define_fn; + decltype(hsa_executable_readonly_variable_define)* + hsa_executable_readonly_variable_define_fn; + decltype(hsa_executable_validate)* hsa_executable_validate_fn; + // Deprecated since v1.1. + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn; + + //===--- Runtime Notifications ------------------------------------------===// + + decltype(hsa_status_string)* hsa_status_string_fn; + + // Start HSA v1.1 additions + decltype(hsa_extension_get_name)* hsa_extension_get_name_fn; + decltype(hsa_system_major_extension_supported)* hsa_system_major_extension_supported_fn; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table_fn; + decltype(hsa_agent_major_extension_supported)* hsa_agent_major_extension_supported_fn; + decltype(hsa_cache_get_info)* hsa_cache_get_info_fn; + decltype(hsa_agent_iterate_caches)* hsa_agent_iterate_caches_fn; + decltype(hsa_signal_silent_store_relaxed)* hsa_signal_silent_store_relaxed_fn; + decltype(hsa_signal_silent_store_screlease)* hsa_signal_silent_store_screlease_fn; + decltype(hsa_signal_group_create)* hsa_signal_group_create_fn; + decltype(hsa_signal_group_destroy)* hsa_signal_group_destroy_fn; + decltype(hsa_signal_group_wait_any_scacquire)* hsa_signal_group_wait_any_scacquire_fn; + decltype(hsa_signal_group_wait_any_relaxed)* hsa_signal_group_wait_any_relaxed_fn; + + //===--- Instruction Set Architecture - HSA v1.1 additions --------------===// + + decltype(hsa_agent_iterate_isas)* hsa_agent_iterate_isas_fn; + decltype(hsa_isa_get_info_alt)* hsa_isa_get_info_alt_fn; + decltype(hsa_isa_get_exception_policies)* hsa_isa_get_exception_policies_fn; + decltype(hsa_isa_get_round_method)* hsa_isa_get_round_method_fn; + decltype(hsa_wavefront_get_info)* hsa_wavefront_get_info_fn; + decltype(hsa_isa_iterate_wavefronts)* hsa_isa_iterate_wavefronts_fn; + + //===--- Code Objects (deprecated) - HSA v1.1 additions -----------------===// + + // Deprecated since v1.1. + decltype(hsa_code_object_get_symbol_from_name)* + hsa_code_object_get_symbol_from_name_fn; + + //===--- Executable - HSA v1.1 additions --------------------------------===// + + decltype(hsa_code_object_reader_create_from_file)* + hsa_code_object_reader_create_from_file_fn; + decltype(hsa_code_object_reader_create_from_memory)* + hsa_code_object_reader_create_from_memory_fn; + decltype(hsa_code_object_reader_destroy)* hsa_code_object_reader_destroy_fn; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt_fn; + decltype(hsa_executable_load_program_code_object)* + hsa_executable_load_program_code_object_fn; + decltype(hsa_executable_load_agent_code_object)* + hsa_executable_load_agent_code_object_fn; + decltype(hsa_executable_validate_alt)* hsa_executable_validate_alt_fn; + decltype(hsa_executable_get_symbol_by_name)* + hsa_executable_get_symbol_by_name_fn; + decltype(hsa_executable_iterate_agent_symbols)* + hsa_executable_iterate_agent_symbols_fn; + decltype(hsa_executable_iterate_program_symbols)* + hsa_executable_iterate_program_symbols_fn; +}; + +// Table to export HSA Apis from Core Runtime, Amd Extensions +// Finalizer and Images +struct HsaApiTable { + + // Version of Hsa Api Table + ApiTableVersion version; + + // Table of function pointers to HSA Core Runtime + CoreApiTable* core_; + + // Table of function pointers to AMD extensions + AmdExtTable* amd_ext_; + + // Table of function pointers to HSA Finalizer Extension + FinalizerExtTable* finalizer_ext_; + + // Table of function pointers to HSA Image Extension + ImageExtTable* image_ext_; + + // Table of function pointers for tools to use + ToolsApiTable* tools_; + + // Table of function pointers to AMD PC Sampling Extension + PcSamplingExtTable* pc_sampling_ext_; +}; + +// Structure containing instances of different api tables +struct HsaApiTableContainer { + HsaApiTable root; + CoreApiTable core; + AmdExtTable amd_ext; + FinalizerExtTable finalizer_ext; + ImageExtTable image_ext; + ToolsApiTable tools; + PcSamplingExtTable pc_sampling_ext; + + // Default initialization of a container instance + HsaApiTableContainer() { + root.version.major_id = HSA_API_TABLE_MAJOR_VERSION; + root.version.minor_id = sizeof(HsaApiTable); + root.version.step_id = HSA_API_TABLE_STEP_VERSION; + + core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION; + core.version.minor_id = sizeof(CoreApiTable); + core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION; + root.core_ = &core; + + amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION; + amd_ext.version.minor_id = sizeof(AmdExtTable); + amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION; + root.amd_ext_ = &amd_ext; + + finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; + finalizer_ext.version.minor_id = sizeof(FinalizerExtTable); + finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; + root.finalizer_ext_ = &finalizer_ext; + + image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; + image_ext.version.minor_id = sizeof(ImageExtTable); + image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION; + root.image_ext_ = &image_ext; + + tools.version.major_id = HSA_TOOLS_API_TABLE_MAJOR_VERSION; + tools.version.minor_id = sizeof(ToolsApiTable); + tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION; + root.tools_ = &tools; + + pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable); + pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + root.pc_sampling_ext_ = &pc_sampling_ext; + } +}; + +// Api to copy function pointers of a table +static +void inline copyApi(void* src, void* dest, size_t size) { + assert(size >= sizeof(ApiTableVersion)); + memcpy((char*)src + sizeof(ApiTableVersion), + (char*)dest + sizeof(ApiTableVersion), + (size - sizeof(ApiTableVersion))); +} + +// Copy Api child tables if valid. +static void inline copyElement(ApiTableVersion* dest, ApiTableVersion* src) { + if (src->major_id && (dest->major_id == src->major_id)) { + dest->step_id = src->step_id; + dest->minor_id = Min(dest->minor_id, src->minor_id); + copyApi(dest, src, dest->minor_id); + } else { + dest->major_id = 0; + dest->minor_id = 0; + dest->step_id = 0; + } +} + +// Copy constructor for all Api tables. The function assumes the +// user has initialized an instance of tables container correctly +// for the Major, Minor and Stepping Ids of Root and Child Api tables. +// The function will overwrite the value of Minor Id by taking the +// minimum of source and destination parameters. It will also overwrite +// the stepping Id with value from source parameter. +static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) { + // Verify Major Id of source and destination tables match + if (dest->version.major_id != src->version.major_id) { + dest->version.major_id = 0; + dest->version.minor_id = 0; + dest->version.step_id = 0; + return; + } + + // Initialize the stepping id and minor id of root table. For the + // minor id which encodes struct size, take the minimum of source + // and destination parameters + dest->version.step_id = src->version.step_id; + dest->version.minor_id = Min(dest->version.minor_id, src->version.minor_id); + + // Copy child tables if present + if ((offsetof(HsaApiTable, core_) < dest->version.minor_id)) + copyElement(&dest->core_->version, &src->core_->version); + if ((offsetof(HsaApiTable, amd_ext_) < dest->version.minor_id)) + copyElement(&dest->amd_ext_->version, &src->amd_ext_->version); + if ((offsetof(HsaApiTable, finalizer_ext_) < dest->version.minor_id)) + copyElement(&dest->finalizer_ext_->version, &src->finalizer_ext_->version); + if ((offsetof(HsaApiTable, image_ext_) < dest->version.minor_id)) + copyElement(&dest->image_ext_->version, &src->image_ext_->version); + if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id)) + copyElement(&dest->tools_->version, &src->tools_->version); + if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id)) + copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version); +} +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h new file mode 100644 index 0000000000..6cf1054823 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H + +// CODE IN THIS FILE **MUST** BE C-COMPATIBLE + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x03 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x01 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x08 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01 +// Rocprofiler just checks HSA_MAGE_EXT_API_TABLE_STEP_VERSION +#define HSA_IMAGE_EXT_API_TABLE_STEP_VERSION HSA_IMAGE_API_TABLE_STEP_VERSION +#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 +#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00 + +#endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h new file mode 100644 index 0000000000..3fd1f9348e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h @@ -0,0 +1,3782 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension. + +#ifndef HSA_RUNTIME_EXT_AMD_H_ +#define HSA_RUNTIME_EXT_AMD_H_ + +#include "hsa.h" +#include "hsa_ext_image.h" +#include "hsa_ven_amd_pc_sampling.h" + +/** + * - 1.0 - initial version + * - 1.1 - dmabuf export + * - 1.2 - hsa_amd_memory_async_copy_on_engine + * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool + * - 1.4 - Virtual Memory API + * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES + * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align + * - 1.7 - hsa_amd_signal_wait_all + * - 1.8 - hsa_amd_memory_get_preferred_copy_engine + * - 1.9 - hsa_amd_portable_export_dmabuf_v2 + * - 1.10 - hsa_amd_vmem_address_reserve: HSA_AMD_VMEM_ADDRESS_NO_REGISTER + * - 1.11 - hsa_amd_agent_info_t: HSA_AMD_AGENT_INFO_CLOCK_COUNTERS + * - 1.12 - hsa_amd_pointer_info: HSA_EXT_POINTER_TYPE_HSA_VMEM and HSA_EXT_POINTER_TYPE_RESERVED_ADDR + * - 1.13 - hsa_amd_pointer_info: Added new registered field to hsa_amd_pointer_info_t + * - 1.14 - hsa_amd_ais_file_write, hsa_amd_ais_file_read + */ +#define HSA_AMD_INTERFACE_VERSION_MAJOR 1 +#define HSA_AMD_INTERFACE_VERSION_MINOR 14 + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Macro to set a flag within uint8_t[8] types. + */ +static inline void hsa_flag_set64(uint8_t* value, uint32_t bit) { + unsigned int index = bit / 8; + unsigned int subBit = bit % 8; + (((uint8_t*)value)[index]) |= (1 << subBit); +} + +/** + * @brief Macro to determine whether a flag is set within uint8_t[8] types. + */ +static inline bool hsa_flag_isset64(uint8_t* value, uint32_t bit) { + unsigned int index = bit / 8; + unsigned int subBit = bit % 8; + return ((uint8_t*)value)[index] & (1 << subBit); +} + +/** + * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants. + */ +typedef uint32_t hsa_signal_condition32_t; + +/** + * @brief AMD vendor specific packet type. + */ +typedef enum { + /** + * Packet used by agents to delay processing of subsequent packets until a + * configurable condition is satisfied by an HSA signal. Only kernel dispatch + * queues created from AMD GPU Agents support this packet. + */ + HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, + /** + * Packet used to send commands to an AIE agent's embedded runtime (ERT). The + * ERT is responsible for, among other things, handling dispatches. Only + * queues created on AIE agents support this packet. + */ + HSA_AMD_PACKET_TYPE_AIE_ERT = 3 +} hsa_amd_packet_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants. + */ +typedef uint8_t hsa_amd_packet_type8_t; + +/** + * @brief AMD vendor specific AQL packet header + */ +typedef struct hsa_amd_packet_header_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Format of the vendor specific packet. + */ + hsa_amd_packet_type8_t AmdFormat; + + /** + * Reserved. Must be 0. + */ + uint8_t reserved; +} hsa_amd_vendor_packet_header_t; + +/** + * @brief AMD barrier value packet. Halts packet processing and waits for + * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value + * is the value of the signal ::signal. + */ +typedef struct hsa_amd_barrier_value_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + + /** + * Dependent signal object. A signal with a handle value of 0 is + * allowed and is interpreted by the packet processor a satisfied + * dependency. + */ + hsa_signal_t signal; + + /** + * Value to compare against. + */ + hsa_signal_value_t value; + + /** + * Bit mask to be combined by bitwise AND with ::signal's value. + */ + hsa_signal_value_t mask; + + /** + * Comparison operation. See ::hsa_signal_condition_t. + */ + hsa_signal_condition32_t cond; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; +} hsa_amd_barrier_value_packet_t; + +/** + * State of an AIE ERT command. + */ +typedef enum { + /** + * Set by the host before submitting a command to the scheduler. + */ + HSA_AMD_AIE_ERT_STATE_NEW = 1, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_QUEUED = 2, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_RUNNING = 3, + /** + * Set by the scheduler when a command completes. + */ + HSA_AMD_AIE_ERT_STATE_COMPLETED = 4, + /** + * Set by the scheduler if a command failed. + */ + HSA_AMD_AIE_ERT_STATE_ERROR = 5, + /** + * Set by the scheduler if a command aborted. + */ + HSA_AMD_AIE_ERT_STATE_ABORT = 6, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7, + /** + * Set by the scheduler on a timeout and reset. + */ + HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8, + /** + * Set by the scheduler on a timeout and fail to reset. + */ + HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9, + HSA_AMD_AIE_ERT_STATE_SKERROR = 10, + HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11, + HSA_AMD_AIE_ERT_STATE_MAX +} hsa_amd_aie_ert_state; + +/** + * Opcode types for HSA AIE ERT commands. + */ +typedef enum { + /** + * Start a workgroup on a compute unit (CU). + */ + HSA_AMD_AIE_ERT_START_CU = 0, + /** + * Currently aliased to HSA_AMD_AIE_ERT_START_CU. + */ + HSA_AMD_AIE_ERT_START_KERNEL = 0, + /** + * Configure command scheduler. + */ + HSA_AMD_AIE_ERT_CONFIGURE = 2, + HSA_AMD_AIE_ERT_EXIT = 3, + HSA_AMD_AIE_ERT_ABORT = 4, + /** + * Execute a specified CU after writing. + */ + HSA_AMD_AIE_ERT_EXEC_WRITE = 5, + /** + * Get stats about a CU's execution. + */ + HSA_AMD_AIE_ERT_CU_STAT = 6, + /** + * Start KDMA CU or P2P. + */ + HSA_AMD_AIE_ERT_START_COPYBO = 7, + /** + * Configure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_CONFIG = 8, + /** + * Start a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_START = 9, + /** + * Unconfigure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_UNCONFIG = 10, + /** + * Initialize a CU. + */ + HSA_AMD_AIE_ERT_INIT_CU = 11, + HSA_AMD_AIE_ERT_START_FA = 12, + HSA_AMD_AIE_ERT_CLK_CALIB = 13, + HSA_AMD_AIE_ERT_MB_VALIDATE = 14, + /** + * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair. + */ + HSA_AMD_AIE_ERT_START_KEY_VAL = 15, + HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16, + HSA_AMD_AIE_ERT_ACCESS_TEST = 17, + /** + * Instruction buffer command format. + */ + HSA_AMD_AIE_ERT_START_DPU = 18, + /** + * Command chain. + */ + HSA_AMD_AIE_ERT_CMD_CHAIN = 19, + /** + * Instruction buffer command format on NPU. + */ + HSA_AMD_AIE_ERT_START_NPU = 20, + /** + * Instruction buffer command with pre-emption format on the NPU. + */ + HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21 +} hsa_amd_aie_ert_cmd_opcode_t; + +/** + * Payload data for AIE ERT start kernel packets (i.e., when the opcode is + * HSA_AMD_AIE_ERT_START_KERNEL). + */ +typedef struct hsa_amd_aie_ert_start_kernel_data_s { + /** + * Address to the PDI. + */ + void* pdi_addr; + /** + * Opcode, instructions and kernel arguments. + */ + uint32_t data[]; +} hsa_amd_aie_ert_start_kernel_data_t; + +/** + * AMD AIE ERT packet. Used for sending a command to an AIE agent. + */ +typedef struct hsa_amd_aie_ert_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + /** + * Format for packets interpreted by the ERT to understand the command and + * payload data. + */ + struct { + /** + * Current state of a command. + */ + uint32_t state : 4; + /** + * Flexible field that can be interpreted on a per-command basis. + */ + uint32_t custom : 8; + /** + * Number of DWORDs in the payload data. + */ + uint32_t count : 11; + /** + * Opcode identifying the command. + */ + uint32_t opcode : 5; + /** + * Type of a command (currently 0). + */ + uint32_t type : 4; + }; + /** + * Reserved. Must be 0. + */ + uint64_t reserved0; + /** + * Reserved. Must be 0. + */ + uint64_t reserved1; + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + /** + * Reserved. Must be 0. + */ + uint64_t reserved4; + /** + * Reserved. Must be 0. + */ + uint64_t reserved5; + /** + * Address of packet data payload. ERT commands contain arbitrarily sized + * data payloads. + */ + uint64_t payload_data; +} hsa_amd_aie_ert_packet_t; + +/** @} */ + +/** \defgroup error-codes Error codes + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * The memory pool is invalid. + */ + HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40, + + /** + * Agent accessed memory beyond the maximum legal address. + */ + HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41, + + /** + * Agent executed an invalid shader instruction. + */ + HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42, + + /** + * Agent attempted to access an inaccessible address. + * See hsa_amd_register_system_event_handler and + * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses. + */ + HSA_STATUS_ERROR_MEMORY_FAULT = 43, + + /** + * The CU mask was successfully set but the mask attempted to enable a CU + * which was disabled for the process. CUs disabled for the process remain + * disabled. + */ + HSA_STATUS_CU_MASK_REDUCED = 44, + + /** + * Exceeded number of VGPRs available on this agent + */ + HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45, + + /** + * Resource is busy or temporarily unavailable + */ + HSA_STATUS_ERROR_RESOURCE_BUSY = 46, + + /** + * Request is not supported by this system + */ + HSA_STATUS_ERROR_NOT_SUPPORTED = 47, +}; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief IOMMU version supported + */ +typedef enum { + /** + * IOMMU not supported + */ + HSA_IOMMU_SUPPORT_NONE = 0, + /* IOMMU V1 support is not relevant to user applications, so not reporting it */ + /** + * IOMMU V2 supported + */ + HSA_IOMMU_SUPPORT_V2 = 1, +} hsa_amd_iommu_version_t; + +/** + * @brief Structure containing information on the agent's clock counters. + */ +typedef struct hsa_amd_clock_counters_s { + uint64_t gpu_clock_counter; + uint64_t cpu_clock_counter; + uint64_t system_clock_counter; + uint64_t system_clock_frequency; +} hsa_amd_clock_counters_t; + +/** + * @brief Agent attributes. + */ +typedef enum hsa_amd_agent_info_s { + /** + * Chip identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000, + /** + * Size of a cacheline in bytes. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, + /** + * The number of compute unit available in the agent. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, + /** + * The maximum clock frequency of the agent in MHz. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, + /** + * Internal driver node identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004, + /** + * Max number of watch points on memory address ranges to generate exception + * events when the watched addresses are accessed. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005, + /** + * Agent BDF_ID, named LocationID in thunk. The type of this attribute is + * uint32_t. + */ + HSA_AMD_AGENT_INFO_BDFID = 0xA006, + /** + * Memory Interface width, the return value type is uint32_t. + * This attribute is deprecated. + */ + HSA_AMD_AGENT_INFO_MEMORY_WIDTH = 0xA007, + /** + * Max Memory Clock, the return value type is uint32_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008, + /** + * Board name of Agent - populated from MarketingName of Kfd Node + * The value is an Ascii string of 64 chars. + */ + HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009, + /** + * Maximum number of waves possible in a Compute Unit. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, + /** + * Number of SIMD's per compute unit CU + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, + /** + * Number of Shader Engines (SE) in Gpu + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES = 0xA00C, + /** + * Number of Shader Arrays Per Shader Engines in Gpu + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE = 0xA00D, + /** + * Address of the HDP flush registers. Use of these registers does not conform to the HSA memory + * model and should be treated with caution. + * The type of this attribute is hsa_amd_hdp_flush_t. + */ + HSA_AMD_AGENT_INFO_HDP_FLUSH = 0xA00E, + /** + * PCIe domain for the agent. Pairs with HSA_AMD_AGENT_INFO_BDFID + * to give the full physical location of the Agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F, + /** + * Queries for support of cooperative queues. See ::HSA_QUEUE_TYPE_COOPERATIVE. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + /** + * Queries UUID of an agent. The value is an Ascii string with a maximum + * of 21 chars including NUL. The string value consists of two parts: header + * and body. The header identifies device type (GPU, CPU, DSP) while body + * encodes UUID as a 16 digit hex string + * + * Agents that do not support UUID will return the string "GPU-XX" or + * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t + */ + HSA_AMD_AGENT_INFO_UUID = 0xA011, + /** + * Queries for the ASIC revision of an agent. The value is an integer that + * increments for each revision. This can be used by user-level software to + * change how it operates, depending on the hardware version. This allows + * selective workarounds for hardware errata. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012, + /** + * Queries whether or not the host can directly access SVM memory that is + * physically resident in the agent's local memory. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013, + /** + * Some processors support more CUs than can reliably be used in a cooperative + * dispatch. This queries the count of CUs which are fully enabled for + * cooperative dispatch. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014, + /** + * Queries the amount of memory available in bytes accross all global pools + * owned by the agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, + /** + * Queries for the ASIC family ID of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107, + /** + * Queries for the Packet Processor(CP Firmware) ucode version of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_UCODE_VERSION = 0xA108, + /** + * Queries for the SDMA engine ucode of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION = 0xA109, + /** + * Queries the number of SDMA engines. + * If HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG query returns non-zero, + * this query returns the the number of SDMA engines optimized for + * host to device bidirectional traffic. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SDMA_ENG = 0xA10A, + /** + * Queries the number of additional SDMA engines optimized for D2D xGMI copies. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG = 0xA10B, + /** + * Queries for version of IOMMU supported by agent. + * The type of this attribute is hsa_amd_iommu_version_t. + */ + HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110, + /** + * Queries for number of XCCs within the agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111, + /** + * Queries for driver unique identifier. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112, + /** + * Returns the hsa_agent_t of the nearest CPU agent + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_AGENT_INFO_NEAREST_CPU = 0xA113, + /** + * Bit-mask indicating memory properties of this agent. A memory property is set if the flag bit + * is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag + * is set. The type of this attribute is uint8_t[8]. + */ + HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES = 0xA114, + /** + * Bit-mask indicating AQL Extensions supported by this agent. An AQL extension is set if the flag + * bit is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag + * is set. The type of this attribute is uint8_t[8]. + */ + HSA_AMD_AGENT_INFO_AQL_EXTENSIONS = 0xA115, /* Not implemented yet */ + /** + * Maximum allowed value in bytes for scratch limit for this agent. This amount + * is shared accross all queues created on this agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX = 0xA116, + /** + * Current scratch limit threshold in bytes for this agent. This limit can be + * modified using the hsa_amd_agent_set_async_scratch_limit call. + * - AQL dispatches that require scratch-memory above this threshold will trigger a + * scratch use-once. + * - AQL dispatches using less scratch-memory than this threshold, ROCr will + * permanently assign the allocated scratch memory to the queue handling the dispatch. + * This memory can be reclaimed by calling hsa_amd_agent_set_async_scratch_limit + * with a lower threshold by current value. + * + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT = 0xA117, + /** + * Queries the driver for clock counters of the agent. + * The type of this attribute is hsa_amd_clock_counters_t. + */ + HSA_AMD_AGENT_INFO_CLOCK_COUNTERS = 0xA118 +} hsa_amd_agent_info_t; + +/** + * @brief Agent memory properties attributes + */ +typedef enum hsa_amd_agent_memory_properties_s { + HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU = (1 << 0), +} hsa_amd_agent_memory_properties_t; + +/** + * @brief SDMA engine IDs unique by single set bit position. + */ +typedef enum hsa_amd_sdma_engine_id { + HSA_AMD_SDMA_ENGINE_0 = 0x1, + HSA_AMD_SDMA_ENGINE_1 = 0x2, + HSA_AMD_SDMA_ENGINE_2 = 0x4, + HSA_AMD_SDMA_ENGINE_3 = 0x8, + HSA_AMD_SDMA_ENGINE_4 = 0x10, + HSA_AMD_SDMA_ENGINE_5 = 0x20, + HSA_AMD_SDMA_ENGINE_6 = 0x40, + HSA_AMD_SDMA_ENGINE_7 = 0x80, + HSA_AMD_SDMA_ENGINE_8 = 0x100, + HSA_AMD_SDMA_ENGINE_9 = 0x200, + HSA_AMD_SDMA_ENGINE_10 = 0x400, + HSA_AMD_SDMA_ENGINE_11 = 0x800, + HSA_AMD_SDMA_ENGINE_12 = 0x1000, + HSA_AMD_SDMA_ENGINE_13 = 0x2000, + HSA_AMD_SDMA_ENGINE_14 = 0x4000, + HSA_AMD_SDMA_ENGINE_15 = 0x8000 +} hsa_amd_sdma_engine_id_t; + +typedef struct hsa_amd_hdp_flush_s { + uint32_t* HDP_MEM_FLUSH_CNTL; + uint32_t* HDP_REG_FLUSH_CNTL; +} hsa_amd_hdp_flush_t; + +/** + * @brief Region attributes. + */ +#ifdef __cplusplus +typedef enum hsa_amd_region_info_s : int { +#else +typedef enum hsa_amd_region_info_s { +#endif + /** + * Determine if host can access the region. The type of this attribute + * is bool. + */ + HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000, + /** + * Base address of the region in flat address space. + */ + HSA_AMD_REGION_INFO_BASE = 0xA001, + /** + * Memory Interface width, the return value type is uint32_t. + * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH. + */ + HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002, + /** + * Max Memory Clock, the return value type is uint32_t. + * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY. + */ + HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003, +} hsa_amd_region_info_t; + +/** + * @brief Coherency attributes of fine grain region. + */ +typedef enum hsa_amd_coherency_type_s { + /** + * Coherent region. + */ + HSA_AMD_COHERENCY_TYPE_COHERENT = 0, + /** + * Non coherent region. + */ + HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1 +} hsa_amd_coherency_type_t; + + +/** + * @brief dmabuf attributes + */ +#ifdef __cplusplus +typedef enum hsa_amd_dma_buf_mapping_type_s : int { +#else +typedef enum hsa_amd_dma_buf_mapping_type_s { +#endif + HSA_AMD_DMABUF_MAPPING_TYPE_NONE = 0, + HSA_AMD_DMABUF_MAPPING_TYPE_PCIE = 1 +} hsa_amd_dma_buf_mapping_type_t; +/** + * @brief Get the coherency type of the fine grain region of an agent. + * + * @param[in] agent A valid agent. + * + * @param[out] type Pointer to a memory location where the HSA runtime will + * store the coherency type of the fine grain region. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL. + */ +hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent, + hsa_amd_coherency_type_t* type); + +/** + * @brief Set the coherency type of the fine grain region of an agent. + * Deprecated. This is supported on KV platforms. For backward compatibility + * other platforms will spuriously succeed. + * + * @param[in] agent A valid agent. + * + * @param[in] type The coherency type to be set. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid. + */ +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent, + hsa_amd_coherency_type_t type); + +/** @} */ + +/** \defgroup profile Profiling + * @{ + */ + +/** + * @brief Structure containing profiling dispatch time information. + * + * Times are reported as ticks in the domain of the HSA system clock. + * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_dispatch_time_s { + /** + * Dispatch packet processing start time. + */ + uint64_t start; + /** + * Dispatch packet completion time. + */ + uint64_t end; +} hsa_amd_profiling_dispatch_time_t; + +/** + * @brief Structure containing profiling async copy time information. + * + * Times are reported as ticks in the domain of the HSA system clock. + * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_async_copy_time_s { + /** + * Async copy processing start time. + */ + uint64_t start; + /** + * Async copy completion time. + */ + uint64_t end; +} hsa_amd_profiling_async_copy_time_t; + +/** + * @brief Enable or disable profiling capability of a queue. + * + * @param[in] queue A valid queue. + * + * @param[in] enable 1 to enable profiling. 0 to disable profiling. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable); + +/** + * @brief Enable or disable asynchronous memory copy profiling. + * + * @details The runtime will provide the copy processing start timestamp and + * completion timestamp of each call to hsa_amd_memory_async_copy if the + * async copy profiling is enabled prior to the call to + * hsa_amd_memory_async_copy. The completion signal object is used to + * hold the last async copy start and end timestamp. The client can retrieve + * these timestamps via call to hsa_amd_profiling_get_async_copy_time. + * + * @param[in] enable True to enable profiling. False to disable profiling. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources + * needed to profile the asynchronous copy. + */ +hsa_status_t HSA_API + hsa_amd_profiling_async_copy_enable(bool enable); + +/** + * @brief Retrieve packet processing time stamps. + * + * @param[in] agent The agent with which the signal was last used. For + * instance, if the profiled dispatch packet is dispatched onto queue Q, + * which was created on agent A, then this parameter must be A. + * + * @param[in] signal A signal used as the completion signal of the dispatch + * packet to retrieve time stamps from. This dispatch packet must have been + * issued to a queue with profiling enabled and have already completed. Also + * the signal must not have yet been used in any other packet following the + * completion of the profiled dispatch packet. + * + * @param[out] time Packet processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent, hsa_signal_t signal, + hsa_amd_profiling_dispatch_time_t* time); + +/** + * @brief Retrieve asynchronous copy timestamps. + * + * @details Async copy profiling is enabled via call to + * hsa_amd_profiling_async_copy_enable. + * + * @param[in] signal A signal used as the completion signal of the call to + * hsa_amd_memory_async_copy. + * + * @param[out] time Async copy processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time( + hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time); + +/** + * @brief Computes the frequency ratio and offset between the agent clock and + * HSA system clock and converts the agent's tick to HSA system domain tick. + * + * @param[in] agent The agent used to retrieve the agent_tick. It is user's + * responsibility to make sure the tick number is from this agent, otherwise, + * the behavior is undefined. + * + * @param[in] agent_tick The tick count retrieved from the specified @p agent. + * + * @param[out] system_tick The translated HSA system domain clock counter tick. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL; + */ +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent, + uint64_t agent_tick, + uint64_t* system_tick); + +/** @} */ + +/** \defgroup status Runtime notifications + * @{ + */ + +/** + * @brief Signal attribute flags. + */ +typedef enum { + /** + * Signal will only be consumed by AMD GPUs. Limits signal consumption to + * AMD GPU agents only. Ignored if @p num_consumers is not zero (all agents). + */ + HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1, + /** + * Signal may be used for interprocess communication. + * IPC signals can be read, written, and waited on from any process. + * Profiling using an IPC enabled signal is only supported in a single process + * at a time. Producing profiling data in one process and consuming it in + * another process is undefined. + */ + HSA_AMD_SIGNAL_IPC = 2, +} hsa_amd_signal_attribute_t; + +/** + * @brief Create a signal with specific attributes. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[in] attributes Requested signal attributes. Multiple signal attributes + * may be requested by combining them with bitwise OR. Requesting no attributes + * (@p attributes == 0) results in the same signal as would have been obtained + * via hsa_signal_create. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. + */ +hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, + const hsa_agent_t* consumers, uint64_t attributes, + hsa_signal_t* signal); + +/** + * @brief Returns a pointer to the value of a signal. + * + * Use of this API does not modify the lifetime of ::signal and any + * hsa_signal_value_t retrieved by this API has lifetime equal to that of + * ::signal. + * + * This API is intended for partial interoperability with non-HSA compatible + * devices and should not be used where HSA interfaces are available. + * + * Use of the signal value must comply with use restritions of ::signal. + * Use may result in data races if the operations performed are not platform + * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC + * attributed signals is required. + * + * @param[in] Signal handle to extract the signal value pointer from. + * + * @param[out] Location where the extracted signal value pointer will be placed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL. + */ +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + +/** + * @brief Asyncronous signal handler function type. + * + * @details Type definition of callback function to be used with + * hsa_amd_signal_async_handler. This callback is invoked if the associated + * signal and condition are met. The callback receives the value of the signal + * which satisfied the associated wait condition and a user provided value. If + * the callback returns true then the callback will be called again if the + * associated signal and condition are satisfied again. If the callback returns + * false then it will not be called again. + * + * @param[in] value Contains the value of the signal observed by + * hsa_amd_signal_async_handler which caused the signal handler to be invoked. + * + * @param[in] arg Contains the user provided value given when the signal handler + * was registered with hsa_amd_signal_async_handler + * + * @retval true resumes monitoring the signal with this handler (as if calling + * hsa_amd_signal_async_handler again with identical parameters) + * + * @retval false stops monitoring the signal with this handler (handler will + * not be called again for this signal) + * + */ +typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg); + +/** + * @brief Register asynchronous signal handler function. + * + * @details Allows registering a callback function and user provided value with + * a signal and wait condition. The callback will be invoked if the associated + * signal and wait condition are satisfied. Callbacks will be invoked serially + * but in an arbitrary order so callbacks should be independent of each other. + * After being invoked a callback may continue to wait for its associated signal + * and condition and, possibly, be invoked again. Or the callback may stop + * waiting. If the callback returns true then it will continue waiting and may + * be called again. If false then the callback will not wait again and will not + * be called again for the associated signal and condition. It is possible to + * register the same callback multiple times with the same or different signals + * and/or conditions. Each registration of the callback will be treated entirely + * independently. + * + * @param[in] signal hsa signal to be asynchronously monitored + * + * @param[in] cond condition value to monitor for + * + * @param[in] value signal value used in condition expression + * + * @param[in] handler asynchronous signal handler invoked when signal's + * condition is met + * + * @param[in] arg user provided value which is provided to handler when handler + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg); + +/** + * @brief Wait for all signal-condition pairs to be satisfied. + * + * @details Allows waiting for all of several signal and condition pairs to be + * satisfied. The function returns 0 if all signals met their conditions and -1 + * on a timeout. The value of each signal's satisfying value is returned in + * satisfying_value unless satisfying_value is nullptr. NULL and invalid signals + * are considered to have value 0 and their conditions already satisfied. This + * function provides only relaxed memory semantics. + */ +uint32_t HSA_API hsa_amd_signal_wait_all(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, hsa_signal_value_t* values, + uint64_t timeout_hint, hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_values); + +/** + * @brief Wait for any signal-condition pair to be satisfied. + * + * @details Allows waiting for any of several signal and conditions pairs to be + * satisfied. The function returns the index into the list of signals of the + * first satisfying signal-condition pair. The function returns + * std::numeric_limits::max() if no valid signal is provided. The value + * of the satisfying signal's value is returned in satisfying_value, unless + * satisfying_value is nullptr or there's no valid signal in the signal-condition + * pairs. NULL and invalid signals are ignored. This function provides only + * relaxed memory semantics. + */ +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value); + +/** @} */ + +/** + * @brief Call a function asynchronously + * + * @details Provides access to the runtime's asynchronous event handling thread + * for general asynchronous functions. Functions queued this way are executed + * in the same manner as if they were a signal handler who's signal is + * satisfied. + * + * @param[in] callback asynchronous function to be invoked + * + * @param[in] arg user provided value which is provided to handler when handler + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg); + +/** \addtogroup ext-images Images and samplers + * @{ + */ + +/** + * @brief Encodes an opaque vendor specific image format. The length of data + * depends on the underlying format. This structure must not be copied as its + * true length can not be determined. + */ +typedef struct hsa_amd_image_descriptor_s { + /* + Version number of the descriptor + */ + uint32_t version; + + /* + Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID. + */ + uint32_t deviceID; + + /* + Start of vendor specific data. + */ + uint32_t data[1]; +} hsa_amd_image_descriptor_t; + +/** + * @brief Creates an image from an opaque vendor specific image format. + * Does not modify data at image_data. Intended initially for + * accessing interop images. + * + * @param agent[in] Agent on which to create the image + * + * @param[in] image_descriptor[in] Vendor specific image format + * + * @param[in] image_data Pointer to image backing store + * + * @param[in] access_permission Access permissions for the image object + * + * @param[out] image Created image object. + * + * @retval HSA_STATUS_SUCCESS Image created successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor, + * null image_data, or mismatched access_permission. + */ +hsa_status_t HSA_API hsa_amd_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const hsa_amd_image_descriptor_t *image_layout, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image +); + +/** + * @brief Query image limits. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute HSA image info attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p value is NULL or @p attribute < + * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute > + * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS. + * + */ +hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Set a queue's CU affinity mask. + * + * @details Enables the queue to run on only selected CUs. The given mask is + * combined by bitwise AND with any device wide mask in HSA_CU_MASK before + * being applied. + * If num_cu_mask_count is 0 then the request is interpreted as a request to + * enable all CUs and no cu_mask array need be given. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[in] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed + * but the given mask attempted to enable a CU which was disabled by + * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not + * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL. + * Devices with work group processors must even-index contiguous pairwise + * CU enable e.g. 0x33(b'110011) is valid while 0x5(0x101) and 0x6(b'0110) + * are invalid. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +/** + * @brief Retrieve a queue's CU affinity mask. + * + * @details Returns the first num_cu_mask_count bits of a queue's CU mask. + * Ensure that num_cu_mask_count is at least as large as + * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[out] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a memory pool. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_AMD_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_AMD_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_AMD_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_AMD_SEGMENT_GROUP = 3, +} hsa_amd_segment_t; + +/** + * @brief A memory pool encapsulates physical storage on an agent + * along with a memory access model. + * + * @details A memory pool encapsulates a physical partition of an agent's + * memory system along with a memory access model. Division of a single + * memory system into separate pools allows querying each partition's access + * path properties (see ::hsa_amd_agent_memory_pool_get_info). Allocations + * from a pool are preferentially bound to that pool's physical partition. + * Binding to the pool's preferential physical partition may not be + * possible or persistent depending on the system's memory policy + * and/or state which is beyond the scope of HSA APIs. + * + * For example, a multi-node NUMA memory system may be represented by multiple + * pool's with each pool providing size and access path information for the + * partition it represents. Allocations from a pool are preferentially bound + * to the pool's partition (which in this example is a NUMA node) while + * following its memory access model. The actual placement may vary or migrate + * due to the system's NUMA policy and state, which is beyond the scope of + * HSA APIs. + */ +typedef struct hsa_amd_memory_pool_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_amd_memory_pool_t; + +typedef enum hsa_amd_memory_pool_global_flag_s { + /** + * The application can use allocations in the memory pool to store kernel + * arguments, and provide the values for the kernarg segment of + * a kernel dispatch. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1, + /** + * Updates to memory in this pool conform to HSA memory consistency model. + * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED + * must not be set. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Writes to memory in this pool can be performed by a single agent at a time. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** Updates to memory in this memory pool have extended scope, acting as + * system-scope atomics for variables in memory regions of this type. + * Note: On non-compliant systems, device-specific actions may be required + * for system-scope coherence. */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8, + +} hsa_amd_memory_pool_global_flag_t; + +typedef enum hsa_amd_memory_pool_location_s { + /** + * This memory pool resides on the host (CPU) + */ + HSA_AMD_MEMORY_POOL_LOCATION_CPU = 0, + /** + * This memory pool resides on a GPU + */ + HSA_AMD_MEMORY_POOL_LOCATION_GPU = 1 +} hsa_amd_memory_pool_location_t; + +/** + * @brief Memory pool features. + */ +typedef enum { + /** + * Segment where the memory pool resides. The type of this attribute is + * ::hsa_amd_segment_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type + * of + * this attribute is uint32_t, a bit-field of + * ::hsa_amd_memory_pool_global_flag_t + * values. + */ + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this pool, in bytes. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SIZE = 2, + /** + * Indicates whether memory in this pool can be allocated using + * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for memory pools in the group and + * private segments. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by + * ::hsa_amd_memory_pool_allocate + * in this memory pool. The size of a buffer allocated in this pool is a + * multiple of the value of this attribute. While this is the minimum size of + * allocation allowed, it is recommened to use + * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE to obtain the recommended + * allocation granularity size for this pool. + * The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for + * this pool. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this + * pool. The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and + * must be a power of 2. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, + /** + * This memory_pool can be made directly accessible by all the agents in the + * system (::hsa_amd_agent_memory_pool_get_info does not return + * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this + * attribute is bool. + */ + HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15, + /** + * Maximum aggregate allocation size in bytes. The type of this attribute + * is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16, + /** + * Location of this memory pool. The type of this attribute + * is hsa_amd_memory_pool_location_t. + */ + HSA_AMD_MEMORY_POOL_INFO_LOCATION = 17, + /** + * Internal block size for allocations. This would also be the recommended + * granularity size for allocations as this prevents internal fragmentation. + * The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool. + * The size of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE = 18, +} hsa_amd_memory_pool_info_t; + +/** + * @brief Memory pool flag used to specify allocation directives + * + */ +typedef enum hsa_amd_memory_pool_flag_s { + /** + * Allocates memory that conforms to standard HSA memory consistency model + */ + HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0, + /** + * Allocates fine grain memory type where memory ordering is per point to point + * connection. Atomic memory operations on these memory buffers are not + * guaranteed to be visible at system scope. + */ + HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0), + /** + * Allocates physically contiguous memory + */ + HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1), + /** + * Allocates executable memory + */ + HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG = (1 << 2), + /** + * Allocates uncached memory + */ + HSA_AMD_MEMORY_POOL_UNCACHED_FLAG = (1 << 3), +} hsa_amd_memory_pool_flag_t; + +/** + * @brief Get the current value of an attribute of a memory pool. + * + * @param[in] memory_pool A valid memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to a application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory pools associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details An agent can directly access buffers located in some memory pool, or + * be enabled to access them by the application (see ::hsa_amd_agents_allow_access), + * yet that memory pool may not be returned by this function for that given + * agent. + * + * A memory pool of fine-grained type must be associated only with the host. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked on the same thread that called + * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is + * associated with the agent. The HSA runtime passes two arguments to the + * callback: the memory pool, and the application data. If @p callback + * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration, + * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that status + * value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data); + +/** + * @brief Allocate a block of memory (or buffer) in the specified pool. + * + * @param[in] memory_pool Memory pool where to allocate memory from. The memory + * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. + * + * @param[out] ptr Pointer to the location where to store the base virtual + * address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in @p memory_pool. If the + * allocation fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p memory_pool, or @p size is greater than + * the value of HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0, + * or flags is not 0. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_amd_memory_pool_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent. + * Because the DMA engines used may not be in the same coherency domain, the caller must ensure + * that buffers are system-level coherent. In general this requires the sending device to have + * released the buffer to system scope prior to executing the copy API and the receiving device + * must execute a system scope acquire fence prior to use of the destination buffer. + * + * @param[out] dst Buffer where the content is to be copied. + * + * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. + * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer, otherwise the copy will succeed + * but contents of @p dst is undefined. + * + * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the copy + * operation starts. The copy will start after every signal has been observed with + * the value 0. The dependent signal should not include completion signal from + * hsa_amd_memory_async_copy operation to be issued in future as that can result + * in a deadlock. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the copy + * operation. When the copy operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. The signal handle must not be 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The + * application is responsible for checking for asynchronous error conditions + * (see the description of @p completion_signal). + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0. + */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent on engine_id. + * + * WARNING: Concurrent use of this call with hsa_amd_memory_async_copy can result + * in resource conflicts as HSA runtime will auto assign engines with the latter + * call. Approach using both calls concurrently with caution. + * + * All param definitions are identical to hsa_amd_memory_async_copy with the + * exception of engine_id and force_copy_on_sdma. + * + * @param[in] - engine_id Target engine defined by hsa_amd_sdma_engine_id_t. + * Client should use hsa_amd_memory_copy_engine_status first to get the ID + * availability. + * + * @param[in] - force_copy_on_sdma By default, blit kernel copies are used when + * dst_agent == src_agent. Setting this to true will force the copy over SDMA1. + * + * All return definitions are identical to hsa_amd_memory_async_copy with the + * following ammendments: + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0 or engine_id is improperly + * bounded. + */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy_on_engine(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal, + hsa_amd_sdma_engine_id_t engine_id, + bool force_copy_on_sdma); +/** + * @brief Reports the availability of SDMA copy engines. + * + * @param[in] dst_agent Destination agent of copy status direction. + * + * @param[in] src_agent Source agent of copy status direction. + * + * @param[out] engine_ids_mask returns available SDMA engine IDs that can be masked + * with hsa_amd_sdma_engine_id_t. + * + * @retval ::HSA_STATUS_SUCCESS Agent has available SDMA engines. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Agent does not have available SDMA engines. + * + */ +hsa_status_t HSA_API +hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent, + uint32_t *engine_ids_mask); + /** + * @brief Returns the preferred SDMA engine mask. + * + * @param[in] dst_agent Destination agent of copy status direction. + * + * @param[in] src_agent Source agent of copy status direction. + * + * @param[out] recommended_ids_mask returns available SDMA engine IDs for max bandwidth + * that can be masked with hsa_amd_sdma_engine_id_t. Can be 0 if there is no preference + * + * @retval ::HSA_STATUS_SUCCESS For mask returned + * + */ +hsa_status_t HSA_API +hsa_amd_memory_get_preferred_copy_engine(hsa_agent_t dst_agent, hsa_agent_t src_agent, + uint32_t* recommended_ids_mask); + +/* +[Provisional API] +Pitched memory descriptor. +All elements must be 4 byte aligned. Pitch and slice are in bytes. +*/ +typedef struct hsa_pitched_ptr_s { + void* base; + size_t pitch; + size_t slice; +} hsa_pitched_ptr_t; + +/* +[Provisional API] +Copy direction flag. +*/ +typedef enum { + hsaHostToHost = 0, + hsaHostToDevice = 1, + hsaDeviceToHost = 2, + hsaDeviceToDevice = 3 +} hsa_amd_copy_direction_t; + +/* +[Provisional API] +SDMA 3D memory copy API. The same requirements must be met by src and dst as in +hsa_amd_memory_async_copy. +Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects +must not overlap. +CPU agents are not supported. API requires SDMA and will return an error if SDMA is not available. +Offsets and range carry x in bytes, y and z in rows and layers. +*/ +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Type of accesses to a memory pool from a given agent. + */ +typedef enum { + /** + * The agent cannot directly access any buffer in the memory pool. + */ + HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0, + /** + * The agent can directly access a buffer located in the pool; the application + * does not need to invoke ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1, + /** + * The agent can directly access a buffer located in the pool, but only if the + * application has previously requested access to that buffer using + * ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2 +} hsa_amd_memory_pool_access_t; + +/** + * @brief Properties of the relationship between an agent a memory pool. + */ +typedef enum { + /** + * Hyper-transport bus type. + */ + HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0, + + /** + * QPI bus type. + */ + HSA_AMD_LINK_INFO_TYPE_QPI = 1, + + /** + * PCIe bus type. + */ + HSA_AMD_LINK_INFO_TYPE_PCIE = 2, + + /** + * Infiniband bus type. + */ + HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3, + + /** + * xGMI link type. + */ + HSA_AMD_LINK_INFO_TYPE_XGMI = 4 + +} hsa_amd_link_info_type_t; + +/** + * @brief Link properties when accessing the memory pool from the specified + * agent. + */ +typedef struct hsa_amd_memory_pool_link_info_s { + /** + * Minimum transfer latency (rounded to ns). + */ + uint32_t min_latency; + + /** + * Maximum transfer latency (rounded to ns). + */ + uint32_t max_latency; + + /** + * Minimum link interface bandwidth in MB/s. + */ + uint32_t min_bandwidth; + + /** + * Maximum link interface bandwidth in MB/s. + */ + uint32_t max_bandwidth; + + /** + * Support for 32-bit atomic transactions. + */ + bool atomic_support_32bit; + + /** + * Support for 64-bit atomic transactions. + */ + bool atomic_support_64bit; + + /** + * Support for cache coherent transactions. + */ + bool coherent_support; + + /** + * The type of bus/link. + */ + hsa_amd_link_info_type_t link_type; + + /** + * NUMA distance of memory pool relative to querying agent + */ + uint32_t numa_distance; +} hsa_amd_memory_pool_link_info_t; + +/** + * @brief Properties of the relationship between an agent a memory pool. + */ +typedef enum { + /** + * Access to buffers located in the memory pool. The type of this attribute + * is ::hsa_amd_memory_pool_access_t. + * + * An agent can always directly access buffers currently located in a memory + * pool that is associated (the memory_pool is one of the values returned by + * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the + * buffer is currently located in a memory pool that is not associated with + * the agent, and the value returned by this function for the given + * combination of agent and memory pool is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke + * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer. + * + * If the given agent can directly access buffers the pool, the result is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated with + * the agent, or it is of fined-grained type, the result must not be + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not associated + * with the agent, and does not reside in the global segment, the result must + * be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0, + + /** + * Number of links to hop when accessing the memory pool from the specified + * agent. The value of this attribute is zero if the memory pool is associated + * with the agent, or if the access type is + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is + * uint32_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1, + + /** + * Details of each link hop when accessing the memory pool starting from the + * specified agent. The type of this attribute is an array size of + * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS with each element containing + * ::hsa_amd_memory_pool_link_info_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2 + +} hsa_amd_agent_memory_pool_info_t; + +/** + * @brief Get the current value of an attribute of the relationship between an + * agent and a memory pool. + * + * @param[in] agent Agent. + * + * @param[in] memory_pool Memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to a application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value); + +/** + * @brief Enable direct access to a buffer from a given set of agents. + * + * @details + * + * Upon return, only the listed agents and the agent associated with the + * buffer's memory pool have direct access to the @p ptr. + * + * Any agent that has access to the buffer before and after the call to + * ::hsa_amd_agents_allow_access will also have access while + * ::hsa_amd_agents_allow_access is in progress. + * + * The caller is responsible for ensuring that each agent in the list + * must be able to access the memory pool containing @p ptr + * (using ::hsa_amd_agent_memory_pool_get_info with ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute), + * otherwise error code is returned. + * + * @param[in] num_agents Size of @p agents. + * + * @param[in] agents List of agents. If @p num_agents is 0, this argument is + * ignored. + * + * @param[in] flags A list of bit-field that is used to specify access + * information in a per-agent basis. This is currently reserved and must be NULL. + * + * @param[in] ptr A buffer previously allocated using ::hsa_amd_memory_pool_allocate. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, or @p agents + * is NULL, @p flags is not NULL, or attempting to enable access to agent(s) + * because @p ptr is allocated from an inaccessible pool. + * + */ +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr); + +/** + * @brief Query if buffers currently located in some memory pool can be + * relocated to a destination memory pool. + * + * @details If the returned value is non-zero, a migration of a buffer to @p + * dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail due to + * resource limitations. + * + * @param[in] src_memory_pool Source memory pool. + * + * @param[in] dst_memory_pool Destination memory pool. + * + * @param[out] result Pointer to a memory location where the result of the query + * is stored. Must not be NULL. If buffers currently located in @p + * src_memory_pool can be relocated to @p dst_memory_pool, the result is + * true. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result); + +/** + * @brief Relocate a buffer to a new memory pool. + * + * @details When a buffer is migrated, its virtual address remains the same but + * its physical contents are moved to the indicated memory pool. + * + * After migration, only the agent associated with the destination pool will have access. + * + * The caller is also responsible for ensuring that the allocation in the + * source memory pool where the buffer is currently located can be migrated to the + * specified destination memory pool (using ::hsa_amd_memory_pool_can_migrate returns a value of true + * for the source and destination memory pools), otherwise behavior is undefined. + * + * The caller must ensure that the buffer is not accessed while it is migrated. + * + * @param[in] ptr Buffer to be relocated. The buffer must have been released to system + * prior to call this API. The buffer will be released to system upon completion. + * + * @param[in] memory_pool Memory pool where to place the buffer. + * + * @param[in] flags A bit-field that is used to specify migration + * information. Must be zero. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t memory_pool, + uint32_t flags); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e multiple mappings are permitted). In + * this case, the same input @p host_ptr may give different locked @p agent_ptr and when it does, + * they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to @p agent_ptr are coarse grained. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handle to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). + * In this case, the same input @p host_ptr may give different locked @p agent_ptr and when it + * does, they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Acesses to the memory via @p agent_ptr have the same access properties as memory allocated from + * @p pool as determined by ::hsa_amd_memory_pool_get_info and ::hsa_amd_agent_memory_pool_get_info + * (ex. coarse/fine grain, platform atomic support, link info). Physical composition and placement + * of the memory (ex. page size, NUMA binding) is not changed. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handle to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[in] pool Global memory pool owned by a CPU agent. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. Reserved parameter, must be 0. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is + * invalid or can not access @p pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned + * by a CPU agent. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0 or flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, + uint32_t flags, void** agent_ptr); + +/** + * + * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or + * ::hsa_amd_memory_lock_to_pool. + * + * @details The behavior is undefined if the host pointer being unpinned does not + * match previous pinned address or if the host pointer was already deallocated. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was + * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); + +/** + * @brief Sets the first @p count of uint32_t of the block of memory pointed by + * @p ptr to the specified @p value. + * + * @param[in] ptr Pointer to the block of memory to fill. + * + * @param[in] value Value to be set. + * + * @param[in] count Number of uint32_t element to be set to the value. + * + * @retval HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or + * not 4 bytes aligned + * + * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory + * region was not allocated with HSA runtime APIs. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count); + +/** + * @brief Maps an interop object into the HSA flat address space and establishes + * memory residency. The metadata pointer is valid during the lifetime of the + * map (until hsa_amd_interop_unmap_buffer is called). + * Multiple calls to hsa_amd_interop_map_buffer with the same interop_handle + * result in multiple mappings with potentially different addresses and + * different metadata pointers. Concurrent operations on these addresses are + * not coherent. Memory must be fenced to system scope to ensure consistency, + * between mappings and with any views of this buffer in the originating + * software stack. + * + * @param[in] num_agents Number of agents which require access to the memory + * + * @param[in] agents List of accessing agents. + * + * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux) + * + * @param [in] flags Reserved, must be 0 + * + * @param[out] size Size in bytes of the mapped object + * + * @param[out] ptr Base address of the mapped object + * + * @param[out] metadata_size Size of metadata in bytes, may be NULL + * + * @param[out] metadata Pointer to metadata, may be NULL + * + * @retval HSA_STATUS_SUCCESS if successfully mapped + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors + */ +hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, + int interop_handle, + uint32_t flags, + size_t* size, + void** ptr, + size_t* metadata_size, + const void** metadata); + +/** + * @brief Removes a previously mapped interop object from HSA's flat address space. + * Ends lifetime for the mapping's associated metadata pointer. + */ +hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr); + +/** + * @brief Denotes the type of memory in a pointer info query. + */ +typedef enum { + /* + Memory is not known to the HSA driver. Unallocated or unlocked system memory. + */ + HSA_EXT_POINTER_TYPE_UNKNOWN = 0, + /* + Memory was allocated with an HSA memory allocator. + */ + HSA_EXT_POINTER_TYPE_HSA = 1, + /* + System memory which has been locked for use with an HSA agent. + + Memory of this type is normal malloc'd memory and is always accessible to + the CPU. Pointer info queries may not include CPU agents in the accessible + agents list as the CPU has implicit access. + */ + HSA_EXT_POINTER_TYPE_LOCKED = 2, + /* + Memory originated in a graphics component and is shared with ROCr. + */ + HSA_EXT_POINTER_TYPE_GRAPHICS = 3, + /* + Memory has been shared with the local process via ROCr IPC APIs. + */ + HSA_EXT_POINTER_TYPE_IPC = 4, + /* + No backend memory but virtual address + */ + HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5, + /* + Memory was allocated with an HSA virtual memory allocator + */ + HSA_EXT_POINTER_TYPE_HSA_VMEM = 6 +} hsa_amd_pointer_type_t; + +/** + * @brief Describes a memory allocation known to ROCr. + * Within a ROCr major version this structure can only grow. + */ +typedef struct hsa_amd_pointer_info_s { + /* + Size in bytes of this structure. Used for version control within a major ROCr + revision. Set to sizeof(hsa_amd_pointer_t) prior to calling + hsa_amd_pointer_info. If the runtime supports an older version of pointer + info then size will be smaller on return. Members starting after the return + value of size will not be updated by hsa_amd_pointer_info. + */ + uint32_t size; + /* + The type of allocation referenced. + */ + hsa_amd_pointer_type_t type; + /* + Base address at which non-host agents may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* agentBaseAddress; + /* + Base address at which the host agent may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* hostBaseAddress; + /* + Size of the allocation. This field is not meaningful if the type of the allocation + is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + size_t sizeInBytes; + /* + Application provided value. This field is not meaningful if the type of the + allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* userData; + /* + Reports an agent which "owns" (ie has preferred access to) the pool in which the + allocation was + made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die + GPU boards) any such agent may be returned. This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in + this process, for e.g if this agent is masked using ROCR_VISIBLE_DEVICES. + */ + hsa_agent_t agentOwner; + /* + Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. + Reports the effective global flags bitmask for the allocation. This field is not + meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + uint32_t global_flags; + + /* + Set to true if this allocation was registered with the underlying driver + This field is not meaningful if the type of the allocation is + HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + bool registered; +} hsa_amd_pointer_info_t; + +/** + * @brief Retrieves information about the allocation referenced by the given + * pointer. Optionally returns the number and list of agents which can + * directly access the allocation. In case this virtual address is unknown, the + * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields + * that are valid after hsa_amd_pointer_info returns are size and type. + * + * @param[in] ptr Pointer which references the allocation to retrieve info for. + * + * @param[in, out] info Pointer to structure to be filled with allocation info. + * Data member size must be set to the size of the structure prior to calling + * hsa_amd_pointer_info. On return size will be set to the size of the + * pointer info structure supported by the runtime, if smaller. Members + * beyond the returned value of size will not be updated by the API. + * Must not be NULL. + * + * @param[in] alloc Function pointer to an allocator used to allocate the + * @p accessible array. If NULL @p accessible will not be returned. + * + * @param[out] num_agents_accessible Recieves the count of agents in + * @p accessible. If NULL @p accessible will not be returned. + * + * @param[out] accessible Recieves a pointer to the array, allocated by @p alloc, + * holding the list of agents which may directly access the allocation. + * May be NULL. + * + * @retval HSA_STATUS_SUCCESS Info retrieved successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info. + */ +hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr, + hsa_amd_pointer_info_t* info, + void* (*alloc)(size_t), + uint32_t* num_agents_accessible, + hsa_agent_t** accessible); + +/** + * @brief Associates an arbitrary pointer with an allocation known to ROCr. + * The pointer can be fetched by hsa_amd_pointer_info in the userData field. + * + * @param[in] ptr Pointer to the first byte of an allocation known to ROCr + * with which to associate @p userdata. + * + * @param[in] userdata Abitrary pointer to associate with the allocation. + * + * @retval HSA_STATUS_SUCCESS @p userdata successfully stored. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr. + */ +hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr, + void* userdata); + +/** + * @brief 256-bit process independent identifier for a ROCr shared memory + * allocation. + */ +typedef struct hsa_amd_ipc_memory_s { + uint32_t handle[8]; +} hsa_amd_ipc_memory_t; + +/** + * @brief Prepares an allocation for interprocess sharing and creates a + * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation. A + * handle is valid while the allocation it references remains accessible in + * any process. In general applications should confirm that a shared memory + * region has been attached (via hsa_amd_ipc_memory_attach) in the remote + * process prior to releasing that memory in the local process. + * Repeated calls for the same allocation may, but are not required to, return + * unique handles. The allocation needs to be on memory on an agent of type + * HSA_DEVICE_TYPE_GPU. + * + * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for + * sharing. + * + * @param[in] len Length in bytes of the allocation to share. + * + * @param[out] handle Process independent identifier referencing the shared + * allocation. + * + * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the + * first byte of an allocation made through ROCr, or len is not the full length + * of the allocation or handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len, + hsa_amd_ipc_memory_t* handle); + +/** + * @brief Imports shared memory into the local process and makes it accessible + * by the given agents. If a shared memory handle is attached multiple times + * in a process each attach may return a different address. Each returned + * address is refcounted and requires a matching number of calls to + * hsa_amd_ipc_memory_detach to release the shared memory mapping. + * + * @param[in] handle Pointer to the identifier for the shared memory. + * + * @param[in] len Length of the shared memory to import. + * Reserved. Must be the full length of the shared allocation in this version. + * + * @param[in] num_agents Count of agents in @p mapping_agents. + * May be zero if all agents are to be allowed access. + * + * @param[in] mapping_agents List of agents to access the shared memory. + * Ignored if @p num_agents is zero. + * + * @param[out] mapped_ptr Recieves a process local pointer to the shared memory. + * + * @retval HSA_STATUS_SUCCESS if memory is successfully imported. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is + * incorrect, @p mapped_ptr is NULL, or some agent for which access was + * requested can not access the shared memory. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_attach( + const hsa_amd_ipc_memory_t* handle, size_t len, + uint32_t num_agents, + const hsa_agent_t* mapping_agents, + void** mapped_ptr); + +/** + * @brief Decrements the reference count for the shared memory mapping and + * releases access to shared memory imported with hsa_amd_ipc_memory_attach. + * + * @param[in] mapped_ptr Pointer to the first byte of a shared allocation + * imported with hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_SUCCESS if @p mapped_ptr was imported with + * hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported + * with hsa_amd_ipc_memory_attach. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr); + +/** @} */ + +/** \addtogroup status Runtime notifications + * @{ + */ + +/** + * @brief 256-bit process independent identifier for a ROCr IPC signal. + */ +typedef hsa_amd_ipc_memory_t hsa_amd_ipc_signal_t; + +/** + * @brief Obtains an interprocess sharing handle for a signal. The handle is + * valid while the signal it references remains valid in any process. In + * general applications should confirm that the signal has been attached (via + * hsa_amd_ipc_signal_attach) in the remote process prior to destroying that + * signal in the local process. + * Repeated calls for the same signal may, but are not required to, return + * unique handles. + * + * @param[in] signal Signal created with attribute HSA_AMD_SIGNAL_IPC. + * + * @param[out] handle Process independent identifier referencing the shared + * signal. + * + * @retval HSA_STATUS_SUCCESS @p handle is ready to use for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is not a valid signal + * created with attribute HSA_AMD_SIGNAL_IPC or handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_signal_create(hsa_signal_t signal, hsa_amd_ipc_signal_t* handle); + +/** + * @brief Imports an IPC capable signal into the local process. If an IPC + * signal handle is attached multiple times in a process each attach may return + * a different signal handle. Each returned signal handle is refcounted and + * requires a matching number of calls to hsa_signal_destroy to release the + * shared signal. + * + * @param[in] handle Pointer to the identifier for the shared signal. + * + * @param[out] signal Recieves a process local signal handle to the shared signal. + * + * @retval HSA_STATUS_SUCCESS if the signal is successfully imported. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid. + */ +hsa_status_t HSA_API hsa_amd_ipc_signal_attach(const hsa_amd_ipc_signal_t* handle, + hsa_signal_t* signal); + +/** + * @brief GPU system event type. + */ +typedef enum hsa_amd_event_type_s { + /* + AMD GPU memory fault. + */ + HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0, + /* + AMD GPU HW Exception. + */ + HSA_AMD_GPU_HW_EXCEPTION_EVENT, + /* + AMD GPU memory error. + */ + HSA_AMD_GPU_MEMORY_ERROR_EVENT, +} hsa_amd_event_type_t; + +/** + * @brief Flags denoting the cause of a memory fault. + */ +typedef enum { + // Page not present or supervisor privilege. + HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0, + // Write access to a read-only page. + HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1, + // Execute access to a page marked NX. + HSA_AMD_MEMORY_FAULT_NX = 1 << 2, + // GPU attempted access to a host only page. + HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, + // DRAM ECC failure. + HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4, + // Can't determine the exact fault address. + HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, + // SRAM ECC failure (ie registers, no fault address). + HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6, + // GPU reset following unspecified hang. + HSA_AMD_MEMORY_FAULT_HANG = 1U << 31 +} hsa_amd_memory_fault_reason_t; + +/** + * @brief AMD GPU memory fault event data. + */ +typedef struct hsa_amd_gpu_memory_fault_info_s { + /* + The agent where the memory fault occurred. + */ + hsa_agent_t agent; + /* + Virtual address accessed. + */ + uint64_t virtual_address; + /* + Bit field encoding the memory access failure reasons. There could be multiple bits set + for one fault. Bits are defined in hsa_amd_memory_fault_reason_t. + */ + uint32_t fault_reason_mask; +} hsa_amd_gpu_memory_fault_info_t; + +/** + * @brief Flags denoting the cause of a memory error. + */ +typedef enum { + // Memory was in use by low-level HW component and cannot be released + HSA_AMD_MEMORY_ERROR_MEMORY_IN_USE = (1 << 0), +} hsa_amd_memory_error_reason_t; + +/** + * @brief AMD GPU memory error event data. + */ +typedef struct hsa_amd_gpu_memory_error_info_s { + /* + The agent where the memory error occurred. + */ + hsa_agent_t agent; + /* + Virtual address involved. + */ + uint64_t virtual_address; + /* + Bit field encoding the memory error failure reasons. There could be multiple bits set + for one error. Bits are defined in hsa_amd_memory_error_reason_t. + */ + uint32_t error_reason_mask; +} hsa_amd_gpu_memory_error_info_t; + +/** + * @brief Flags denoting the type of a HW exception + */ +typedef enum { + // Unused for now + HSA_AMD_HW_EXCEPTION_RESET_TYPE_OTHER = 1 << 0, +} hsa_amd_hw_exception_reset_type_t; + +/** + * @brief Flags denoting the cause of a HW exception + */ +typedef enum { + // GPU Hang + HSA_AMD_HW_EXCEPTION_CAUSE_GPU_HANG = 1 << 0, + // SRAM ECC + HSA_AMD_HW_EXCEPTION_CAUSE_ECC = 1 << 1, +} hsa_amd_hw_exception_reset_cause_t; + +/** + * @brief AMD GPU HW Exception event data. + */ +typedef struct hsa_amd_gpu_hw_exception_info_s { + /* + The agent where the HW exception occurred. + */ + hsa_agent_t agent; + hsa_amd_hw_exception_reset_type_t reset_type; + hsa_amd_hw_exception_reset_cause_t reset_cause; +} hsa_amd_gpu_hw_exception_info_t; + +/** + * @brief AMD GPU event data passed to event handler. + */ +typedef struct hsa_amd_event_s { + /* + The event type. + */ + hsa_amd_event_type_t event_type; + union { + /* + The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT. + */ + hsa_amd_gpu_memory_fault_info_t memory_fault; + /* + The memory fault info, only valid when @p event_type is HSA_AMD_GPU_HW_EXCEPTION_EVENT. + */ + hsa_amd_gpu_hw_exception_info_t hw_exception; + /* + The memory error info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_ERROR_EVENT. + */ + hsa_amd_gpu_memory_error_info_t memory_error; + }; +} hsa_amd_event_t; + +typedef hsa_status_t (*hsa_amd_system_event_callback_t)(const hsa_amd_event_t* event, void* data); + +/** + * @brief Register AMD GPU event handler. + * + * @param[in] callback Callback to be invoked when an event is triggered. + * The HSA runtime passes two arguments to the callback: @p event + * is defined per event by the HSA runtime, and @p data is the user data. + * + * @param[in] data User data that is passed to @p callback. May be NULL. + * + * @retval HSA_STATUS_SUCCESS The handler has been registered successfully. + * + * @retval HSA_STATUS_ERROR An event handler has already been registered. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p event is invalid. + */ +hsa_status_t HSA_API hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback, + void* data); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Per-queue dispatch and wavefront scheduling priority. + */ +typedef enum hsa_amd_queue_priority_s { + /* + Below normal/high priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_LOW = 0, + /* + Above low priority compute, below high priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_NORMAL = 1, + /* + Above low/normal priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_HIGH = 2, +} hsa_amd_queue_priority_t; + +/** + * @brief Modifies the dispatch and wavefront scheduling prioirty for a + * given compute queue. The default is HSA_AMD_QUEUE_PRIORITY_NORMAL. + * + * @param[in] queue Compute queue to apply new priority to. + * + * @param[in] priority Priority to associate with queue. + * + * @retval HSA_STATUS_SUCCESS if priority was changed successfully. + * + * @retval HSA_STATUS_ERROR_INVALID_QUEUE if queue is not a valid + * compute queue handle. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if priority is not a valid + * value from hsa_amd_queue_priority_t. + */ +hsa_status_t HSA_API hsa_amd_queue_set_priority(hsa_queue_t* queue, + hsa_amd_queue_priority_t priority); + +/** + * @brief Queue creation attributes. + */ +typedef enum { + /** + * The queue's packet buffer and queue descriptor struct should be + * allocated in system memory (default). Mutually exclusive with + * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF and + * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR. + */ + HSA_AMD_QUEUE_CREATE_SYSTEM_MEM = 0, + /** + * The queue's packet buffer should be allocated in the agent's + * fine-grain device memory region. + */ + HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF = (1 << 0), + /** + * The queue desciptor struct should be allocated in the agent's + * fine-grain device memory region. Not supported for devices + * connected via PCIe because the CPU's atomic read-modify-write + * operations cannot be promoted to PCIe atomic read-modify-write + * operations. + */ + HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR = (1 << 1), +} hsa_amd_queue_create_flag_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Deallocation notifier function type. + */ +typedef void (*hsa_amd_deallocation_callback_t)(void* ptr, void* user_data); + +/** + * @brief Registers a deallocation notifier monitoring for release of agent + * accessible address @p ptr. If successful, @p callback will be invoked when + * @p ptr is removed from accessibility from all agents. + * + * Notification callbacks are automatically deregistered when they are invoked. + * + * Note: The current version supports notifications of address release + * originating from ::hsa_amd_memory_pool_free. Support for other address + * release APIs will follow. + * + * @param[in] ptr Agent accessible address to monitor for deallocation. Passed + * to @p callback. + * + * @param[in] callback Notifier to be invoked when @p ptr is released from + * agent accessibility. + * + * @param[in] user_data User provided value passed to @p callback. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The notifier registered successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p ptr does not refer to a valid agent accessible + * address. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL or @p ptr is NULL. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + */ +hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, + hsa_amd_deallocation_callback_t callback, + void* user_data); + +/** + * @brief Removes a deallocation notifier previously registered with + * ::hsa_amd_register_deallocation_callback. Arguments must be identical to + * those given in ::hsa_amd_register_deallocation_callback. + * + * @param[in] ptr Agent accessible address which was monitored for deallocation. + * + * @param[in] callback Notifier to be removed. + * + * @retval ::HSA_STATUS_SUCCESS The notifier has been removed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The given notifier was not registered. + */ +hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, + hsa_amd_deallocation_callback_t callback); + +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency + * model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent + * at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1, + /** + * Memory region queried contains subregions with both + * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and + * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes. + * + * This attribute can not be used in hsa_amd_svm_attributes_set. It is a + * possible return from hsa_amd_svm_attributes_get indicating that the query + * region contains both coarse and fine grained memory. + */ + HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Queries the physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // Optimizes with the anticipation that the majority of operations to the + // range will be read operations. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6, + // Allows the execution on GPU. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range. Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based migration. + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. Bit values should be interpreted according to the type + // given in the associated attribute description. + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. + * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. + * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. + */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** @} */ + +/** \addtogroup profile Profiling + * @{ + */ + +/** + * @brief Acquire Stream Performance Monitor on an agent + * + * Acquire exclusive use of SPM on @p preferred_agent. + * See hsa_amd_spm_set_dest_buffer to provide a destination buffer to KFD to start recording and + * retrieve this data. + * @param[in] preferred_agent Agent on which to acquire SPM + */ +hsa_status_t hsa_amd_spm_acquire(hsa_agent_t preferred_agent); + +/** + * @brief Release Stream Performance Monitor on an agent + * + * Release exclusive use of SPM on @p preferred_agent. This will stop KFD writing SPM data. + * If a destination buffer is set, then data in the destination buffer is available to user + * when this function returns. + * + * @param[in] preferred_agent Agent on which to release SPM + */ +hsa_status_t hsa_amd_spm_release(hsa_agent_t preferred_agent); + +/** + * @brief Set up the current destination user mode buffer for stream performance + * counter data. KFD will start writing SPM data into the destination buffer. KFD will continue + * to copy data into the current destination buffer until any of the following functions are called + * - hsa_amd_spm_release + * - hsa_amd_spm_set_dest_buffer with dest set to NULL + * - hsa_amd_spm_set_dest_buffer with dest set to a new buffer + * + * if @p timeout is non-0, the call will wait for up to @p timeout ms for the previous + * buffer to be filled. If previous buffer to be filled before timeout, the @p timeout + * will be updated value with the time remaining. If the timeout is exceeded, the function + * copies any partial data available into the previous user buffer and returns success. + * User should not access destination data while KFD is copying data. + * If the previous destination buffer was full, then @p is_data_loss flag is set. + * @p dest is CPU accessible memory. It could be malloc'ed memory or host allocated memory + * + * @param[in] preferred_agent Agent on which to set the dest buffer + * + * @param[in] size_in_bytes size of the buffer + * + * @param[in,out] timeout timeout in milliseconds + * + * @param[out] size_copied number of bytes copied + * + * @param[in] dest destination address. Set to NULL to stop copy on previous buffer + * + * @param[out] is_data_loss true is data was lost + */ +hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t size_in_bytes, + uint32_t* timeout, uint32_t* size_copied, void* dest, + bool* is_data_loss); + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Older version of export dmabuf + * + * This is the same as calling the v2 version of export dmabuf with the + * flags argument set to HSA_AMD_DMABUF_MAPPING_TYPE_NONE. + * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. + * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf, + uint64_t* offset); + + /** + * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation. + * + * Obtains an OS specific handle to GPU agent memory. The memory must be part + * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent. + * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access + * to the allocation. + * + * Shared access to the memory is not guaranteed to be fine grain coherent even + * if the allocation exported is from a fine grain pool. The shared memory + * consistency model will be no stronger than the model exported from, consult + * the importing API to determine the final consistency model. + * + * The allocation's memory remains valid as long as the handle and any mapping + * of the handle remains valid. When the handle and all mappings are closed + * the backing memory will be released for reuse. + * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. + * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @param[in] flags Bitmask of hsa_amd_dma_buf_mapping_type_t flags. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size, + int* dmabuf, uint64_t* offset, uint64_t flags); + +/** + * @brief Closes an OS specific, vendor neutral, handle to a memory allocation. + * + * Closes an OS specific handle to GPU agent memory. + * + * Applications should close a handle after imports are complete. The handle + * is not required to remain open for the lifetime of imported mappings. The + * referenced allocation will remain valid until all handles and mappings + * are closed. + * + * @param[in] dmabuf Handle to be closed. + * + * @retval ::HSA_STATUS_SUCCESS Handle closed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE A generic error was encountered + * when closing the handle. The handle may have been closed already or an + * async IO error may have occured. + */ +hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); + +typedef enum hsa_amd_vmem_address_reserve_flag_s { + // Only reserve a VA range without registering it to the underlying driver + HSA_AMD_VMEM_ADDRESS_NO_REGISTER = (1UL << 0), +} hsa_amd_vmem_address_reserve_flag_t; + +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + * + * Note that this API will be deprecated in a future release and replaced by + * hsa_amd_vmem_address_reserve_align + */ +hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address, + uint64_t flags); + +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2 + * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + */ +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + +/** + * @brief Free a reserved address range + * + * Free a previously allocated address range. The size must match the size of a previously + * allocated address range. + * + * @param[out] va virtual address to be freed + * @param[in] size of address range + * + * @retval ::HSA_STATUS_SUCCESS Address range released successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid va specified + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid size specified + * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE Address range is still in use + * @retval ::HSA_STATUS_ERROR Internal unexpected error + */ +hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size); + +/** + * @brief Struct containing an opaque handle to a memory allocation handle + */ +typedef struct hsa_amd_vmem_alloc_handle_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_amd_vmem_alloc_handle_t; + +typedef enum { + MEMORY_TYPE_NONE, + MEMORY_TYPE_PINNED, +} hsa_amd_memory_type_t; + +/** + * @brief Create a virtual memory handle + * + * Create a virtual memory handle within this pool + * @p size must be a aligned to allocation granule size for this memory pool, see + * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE + * To minimize internal memory fragmentation, align the size to the recommended allocation granule + * size, see HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE + * + * @param[in] pool memory to use + * @param[in] size of the memory allocation + * @param[in] type of memory + * @param[in] flags - currently unsupported + * @param[out] memory_handle - handle for the allocation + * + * @retval ::HSA_STATUS_SUCCESS memory allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid arguments + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION This memory pool does not support allocations + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate this memory + */ +hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, + hsa_amd_memory_type_t type, uint64_t flags, + hsa_amd_vmem_alloc_handle_t* memory_handle); + +/** + * @brief Release a virtual memory handle + * + * @param[in] memory handle that was previously allocated + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + */ +hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle); + +/** + * @brief Map a virtual memory handle + * + * Map a virtual memory handle to a reserved address range. The virtual address requested must be + * within a previously reserved address range. @p va and (@p va + size) must be must be within + * (va + size) of the previous allocated address range. + * @p size must be equal to size of the @p memory_handle + * hsa_amd_vmem_set_access needs to be called to make the memory accessible to specific agents + * + * @param[in] va virtual address range where memory will be mapped + * @param[in] size of memory mapping + * @param[in] in_offset offset into memory. Currently unsupported + * @param[in] memory_handle virtual memory handle to be mapped + * @param[in] flags. Currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS Memory mapped successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset, + hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags); + +/** + * @brief Unmap a virtual memory handle + * + * Unmap previously mapped virtual address range + * + * @param[in] va virtual address range where memory will be mapped + * @param[in] size of memory mapping + * + * @retval ::HSA_STATUS_SUCCESS Memory backing unmapped successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT size is invalid + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_unmap(void* va, size_t size); + +typedef struct hsa_amd_memory_access_desc_s { + hsa_access_permission_t permissions; + hsa_agent_t agent_handle; +} hsa_amd_memory_access_desc_t; + +/** + * @brief Make a memory mapping accessible + * + * Make previously mapped virtual address accessible to specific agents. @p size must be equal to + * size of previously mapped virtual memory handle. + * Calling hsa_amd_vmem_set_access multiple times on the same @p va: + * - Will overwrite permissions for agents specified in @p desc + * - Will leave permissions unchanged for agents not specified in @p desc + * + * @param[in] va previously mapped virtual address + * @param[in] size of memory mapping + * @param[in] desc list of access permissions for each agent + * @param[in] desc_cnt number of elements in desc + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent in desc + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, + const hsa_amd_memory_access_desc_t* desc, + size_t desc_cnt); + +/** + * @brief Get current access permissions for memory mapping + * + * Get access permissions for memory mapping for specific agent. + * + * @param[in] va previously mapped virtual address + * @param[in] perms current permissions + * @param[in] agent_handle agent + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION va is not mapped or permissions never set for this + * agent + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, + hsa_agent_t agent_handle); + +/** + * @brief Get an exportable shareable handle + * + * Get an exportable shareable handle for a memory_handle. This shareabl handle can then be used to + * re-create a virtual memory handle using hsa_amd_vmem_import_shareable_handle. The shareable + * handle can be transferred using mechanisms that support posix file descriptors Once all shareable + * handles are closed, the memory_handle is released. + * + * @param[out] dmabuf_fd shareable handle + * @param[in] handle previously allocated virtual memory handle + * @param[in] flags Currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, + hsa_amd_vmem_alloc_handle_t handle, + uint64_t flags); +/** + * @brief Import a shareable handle + * + * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed + * and released results in undefined behavior. + * + * @param[in] dmabuf_fd shareable handle exported with hsa_amd_vmem_export_shareable_handle + * @param[out] handle virtual memory handle + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, + hsa_amd_vmem_alloc_handle_t* handle); + +/** + * @brief Returns memory handle for mapped memory + * + * Return a memory handle for previously mapped memory. The handle will be the same value of handle + * used to map the memory. The returned handle must be released with corresponding number of calls + * to hsa_amd_vmem_handle_release. + * + * @param[out] memory_handle memory handle for this mapped address + * @param[in] mapped address + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid address + */ +hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle, + void* addr); + +/** + * @brief Returns the current allocation properties of a handle + * + * Returns the allocation properties of an existing handle + * + * @param[in] memory_handle memory handle to be queried + * @param[out] pool memory pool that owns this handle + * @param[out] memory type + + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle + */ +hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( + hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool, + hsa_amd_memory_type_t* type); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Set the asynchronous scratch limit threshold on all the queues for this agent. + * Dispatches that are enqueued on HW queues on this agent that are smaller than threshold will not + * result in a scratch use-once method. + * + * Increasing this threshold will only increase the internal limit and not cause immediate allocation + * of additional scratch memory. Decreasing this threshold will result in a release in scratch memory + * on queues where the current amount of allocated scratch exceeds the new limit. + * + * If this API call would result in a release in scratch memory and there are dispatches that are + * currently using scratch memory on this agent, this will result into a blocking call until the + * current dispatches are completed. + * + * This API is only supported on devices that support asynchronous scratch reclaim. + * + * @param[in] agent A valid agent. + * + * @param[in] threshold Threshold size in bytes + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT This agent does not support asynchronous scratch + * reclaim + */ +hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); + +typedef enum { + /* + * Returns the agent that owns the underlying HW queue. + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_QUEUE_INFO_AGENT, + /* + * Returns the doorbell ID of the completion signal of the queue + * The type of this attribute is uint64_t. + */ + HSA_AMD_QUEUE_INFO_DOORBELL_ID, +} hsa_queue_info_attribute_t; + +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + +typedef struct hsa_amd_ais_file_handle_s { + /* + * file handle for AIS read & write. Linux will use fd. + * pad is keep the size consistent accross different platforms. + */ + union { + void* handle; + int fd; + uint8_t pad[8]; + }; +} hsa_amd_ais_file_handle_t; + +/** + * @brief Write data from device memory to a file + * + * Writes data from device memory buffer to a file at the specified offset. + * The device memory pointer must be accessible from the host and point to + * a valid allocation. + * + * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and + * APIs may be modified + * + * @param[in] handle Handle of the file to write to. + * + * @param[in] devicePtr Device memory buffer pointer containing data to write. + * + * @param[in] size Size in bytes of the data to write. + * + * @param[in] file_offset Offset in bytes into the file where data will be written. + * + * @param[in/out] size_copied Actual number of bytes copied + * + * @param[in/out] status Additional status if any + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fd is invalid, @p devicePtr + * is NULL, or @p size is 0. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to + * a valid allocation. + * + * @retval ::HSA_STATUS_ERROR An error occurred during the write operation. + */ +hsa_status_t HSA_API hsa_amd_ais_file_write(hsa_amd_ais_file_handle_t handle, void *devicePtr, + uint64_t size, int64_t file_offset, + uint64_t *size_copied, int32_t *status); + +/** + * @brief Read data from a file to device memory + * + * Reads data from a file at the specified offset into a device memory buffer. + * The device memory pointer must be accessible from the host and point to + * a valid allocation. + * + * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and + * APIs may be modified + * @param[in] hanlde Handle of the file to read from. + * + * @param[in] devicePtr Device memory buffer pointer to store the read data. + * + * @param[in] size Size in bytes of the data to read. + * + * @param[in] file_offset Offset in bytes into the file where data will be read from. + * + * @param[in/out] size_copied Actual number of bytes copied + * + * @param[in/out] status Additional status if any + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fd is invalid, @p devicePtr + * is NULL, or @p size is 0. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to + * a valid allocation. + * + * @retval ::HSA_STATUS_ERROR An error occurred during the read operation. + */ +hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devicePtr, + uint64_t size, int64_t file_offset, + uint64_t *size_copied, int32_t *status); + +/** + * @brief logging types + */ +typedef enum hsa_amd_log_flag_s { + /* Log AQL packets internally enqueued by ROCr */ + HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS = 0, + HSA_AMD_LOG_FLAG_AQL = 0, + /* Log SDMA packets */ + HSA_AMD_LOG_FLAG_SDMA = 1, + /* Log INFO */ + HSA_AMD_LOG_FLAG_INFO = 2, +} hsa_amd_log_flag_t; + +/** + * @brief Enable logging via external file + * If this function is called multiple times, the last call to this function will overwrite the + * previous @p flags and @p file. + * + * @param[in] flags is used to filter types of logging. Type is uint8_t[8]. + * Can be set using the hsa_flag_set64 macro. Setting @p flags to 0 will disable logging. + * @param[in] file file stream to output logging. If file is NULL, prints are sent to stderr. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void* file); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h new file mode 100644 index 0000000000..94c4582055 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h @@ -0,0 +1,531 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ +#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_FINALIZER +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct BrigModuleHeader; +typedef struct BrigModuleHeader* BrigModule_t; + +/** \defgroup ext-alt-finalizer-extensions Finalization Extensions + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + */ +enum { + /** + * The HSAIL program is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000, + /** + * The HSAIL module is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001, + /** + * Machine model or profile of the HSAIL module do not match the machine model + * or profile of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002, + /** + * The HSAIL module is already a part of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003, + /** + * Compatibility mismatch between symbol declaration and symbol definition. + */ + HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004, + /** + * The finalization encountered an error while finalizing a kernel or + * indirect function. + */ + HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005, + /** + * Mismatch between a directive in the control directive structure and in + * the HSAIL kernel. + */ + HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006 +}; + +/** @} */ + +/** \defgroup ext-alt-finalizer-program Finalization Program + * @{ + */ + +/** + * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains + * the definition of the BrigModule_t type. + */ +typedef BrigModule_t hsa_ext_module_t; + +/** + * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL + * modules that collectively define functions and variables used by kernels and + * indirect functions. + */ +typedef struct hsa_ext_program_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_ext_program_t; + +/** + * @brief Create an empty HSAIL program. + * + * @param[in] machine_model Machine model used in the HSAIL program. + * + * @param[in] profile Profile used in the HSAIL program. + * + * @param[in] default_float_rounding_mode Default float rounding mode used in + * the HSAIL program. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] program Memory location where the HSA runtime stores the newly + * created HSAIL program handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid, + * @p profile is invalid, @p default_float_rounding_mode is invalid, or + * @p program is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_create( + hsa_machine_model_t machine_model, + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_ext_program_t *program); + +/** + * @brief Destroy a HSAIL program. + * + * @details The HSAIL program handle becomes invalid after it has been + * destroyed. Code object handles produced by ::hsa_ext_program_finalize are + * still valid after the HSAIL program has been destroyed, and can be used as + * intended. Resources allocated outside and associated with the HSAIL program + * (such as HSAIL modules that are added to the HSAIL program) can be released + * after the finalization program has been destroyed. + * + * @param[in] program HSAIL program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + */ +hsa_status_t HSA_API hsa_ext_program_destroy( + hsa_ext_program_t program); + +/** + * @brief Add a HSAIL module to an existing HSAIL program. + * + * @details The HSA runtime does not perform a deep copy of the HSAIL module + * upon addition. Instead, it stores a pointer to the HSAIL module. The + * ownership of the HSAIL module belongs to the application, which must ensure + * that @p module is not released before destroying the HSAIL program. + * + * The HSAIL module is successfully added to the HSAIL program if @p module is + * valid, if all the declarations and definitions for the same symbol are + * compatible, and if @p module specify machine model and profile that matches + * the HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] module HSAIL module. The application can add the same HSAIL module + * to @p program at most once. The HSAIL module must specify the same machine + * model and profile as @p program. If the floating-mode rounding mode of @p + * module is not default, then it should match that of @p program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p + * module does not match machine model of @p program, or the profile of @p + * module does not match profile of @p program. + * + * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is + * already a part of the HSAIL program. + * + * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol + * definition compatibility mismatch. See the symbol compatibility rules in the + * HSA Programming Reference Manual. + */ +hsa_status_t HSA_API hsa_ext_program_add_module( + hsa_ext_program_t program, + hsa_ext_module_t module); + +/** + * @brief Iterate over the HSAIL modules in a program, and invoke an + * application-defined callback on every iteration. + * + * @param[in] program HSAIL program. + * + * @param[in] callback Callback to be invoked once per HSAIL module in the + * program. The HSA runtime passes three arguments to the callback: the program, + * a HSAIL module, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_ext_program_iterate_modules returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_iterate_modules( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, + void* data), + void* data); + +/** + * @brief HSAIL program attributes. + */ +typedef enum { + /** + * Machine model specified when the HSAIL program was created. The type + * of this attribute is ::hsa_machine_model_t. + */ + HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0, + /** + * Profile specified when the HSAIL program was created. The type of + * this attribute is ::hsa_profile_t. + */ + HSA_EXT_PROGRAM_INFO_PROFILE = 1, + /** + * Default float rounding mode specified when the HSAIL program was + * created. The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2 +} hsa_ext_program_info_t; + +/** + * @brief Get the current value of an attribute for a given HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behaviour is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * HSAIL program attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_get_info( + hsa_ext_program_t program, + hsa_ext_program_info_t attribute, + void *value); + +/** + * @brief Finalizer-determined call convention. + */ +typedef enum { + /** + * Finalizer-determined call convention. + */ + HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1 +} hsa_ext_finalizer_call_convention_t; + +/** + * @brief Control directives specify low-level information about the + * finalization process. + */ +typedef struct hsa_ext_control_directives_s { + /** + * Bitset indicating which control directives are enabled. The bit assigned to + * a control directive is determined by the corresponding value in + * BrigControlDirective. + * + * If a control directive is disabled, its corresponding field value (if any) + * must be 0. Control directives that are only present or absent (such as + * partial workgroups) have no corresponding field as the presence of the bit + * in this mask is sufficient. + */ + uint64_t control_directives_mask; + /** + * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit + * assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains a enablebreakexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t break_exceptions_mask; + /** + * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The + * bit assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains a enabledetectexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t detect_exceptions_mask; + /** + * Maximum size (in bytes) of dynamic group memory that will be allocated by + * the application for any dispatch of the kernel. If the kernel contains a + * maxdynamicsize control directive, the two values should match. + */ + uint32_t max_dynamic_group_size; + /** + * Maximum number of grid work-items that will be used by the application to + * launch the kernel. If the kernel contains a maxflatgridsize control + * directive, the value of @a max_flat_grid_size must not be greater than the + * value of the directive, and takes precedence. + * + * The value specified for maximum absolute grid size must be greater than or + * equal to the product of the values specified by @a required_grid_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint64_t max_flat_grid_size; + /** + * Maximum number of work-group work-items that will be used by the + * application to launch the kernel. If the kernel contains a + * maxflatworkgroupsize control directive, the value of @a + * max_flat_workgroup_size must not be greater than the value of the + * directive, and takes precedence. + * + * The value specified for maximum absolute grid size must be greater than or + * equal to the product of the values specified by @a required_workgroup_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint32_t max_flat_workgroup_size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Grid size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredgridsize control directive, the + * dimensions should match. + * + * The specified grid size must be consistent with @a required_workgroup_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_grid_size. Note that the listed invariants must hold + * only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + uint64_t required_grid_size[3]; + /** + * Work-group size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredworkgroupsize control directive, + * the dimensions should match. + * + * The specified work-group size must be consistent with @a required_grid_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_workgroup_size. Note that the listed invariants must + * hold only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + hsa_dim3_t required_workgroup_size; + /** + * Number of dimensions that will be used by the application to launch the + * kernel. If the kernel contains a requireddim control directive, the two + * values should match. + * + * The specified dimensions must be consistent with @a required_grid_size and + * @a required_workgroup_size. This invariant must hold only if all the + * corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a + * control_directives_mask, this field must be 1, 2, or 3. + */ + uint8_t required_dim; + /** + * Reserved. Must be 0. + */ + uint8_t reserved2[75]; +} hsa_ext_control_directives_t; + +/** + * @brief Finalize an HSAIL program for a given instruction set architecture. + * + * @details Finalize all of the kernels and indirect functions that belong to + * the same HSAIL program for a specific instruction set architecture (ISA). The + * transitive closure of all functions specified by call or scall must be + * defined. Kernels and indirect functions that are being finalized must be + * defined. Kernels and indirect functions that are referenced in kernels and + * indirect functions being finalized may or may not be defined, but must be + * declared. All the global/readonly segment variables that are referenced in + * kernels and indirect functions being finalized may or may not be defined, but + * must be declared. + * + * @param[in] program HSAIL program. + * + * @param[in] isa Instruction set architecture to finalize for. + * + * @param[in] call_convention A call convention used in a finalization. Must + * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive) + * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p + * isa (not inclusive). + * + * @param[in] control_directives Low-level control directives that influence + * the finalization process. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[in] code_object_type Type of code object to produce. + * + * @param[out] code_object Code object generated by the Finalizer, which + * contains the machine code for the kernels and indirect functions in the HSAIL + * program. The code object is independent of the HSAIL module that was used to + * generate it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in + * the control directive structure and in the HSAIL kernel mismatch, or if the + * same directive is used with a different value in one of the functions used by + * this kernel. + * + * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer + * encountered an error while compiling a kernel or an indirect function. + */ +hsa_status_t HSA_API hsa_ext_program_finalize( + hsa_ext_program_t program, + hsa_isa_t isa, + int32_t call_convention, + hsa_ext_control_directives_t control_directives, + const char *options, + hsa_code_object_type_t code_object_type, + hsa_code_object_t *code_object); + +/** @} */ + +#define hsa_ext_finalizer_1_00 + +typedef struct hsa_ext_finalizer_1_00_pfn_s { + hsa_status_t (*hsa_ext_program_create)( + hsa_machine_model_t machine_model, hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, hsa_ext_program_t *program); + + hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program); + + hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program, + hsa_ext_module_t module); + + hsa_status_t (*hsa_ext_program_iterate_modules)( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, + hsa_ext_module_t module, void *data), + void *data); + + hsa_status_t (*hsa_ext_program_get_info)( + hsa_ext_program_t program, hsa_ext_program_info_t attribute, + void *value); + + hsa_status_t (*hsa_ext_program_finalize)( + hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, + hsa_ext_control_directives_t control_directives, const char *options, + hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object); +} hsa_ext_finalizer_1_00_pfn_t; + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h new file mode 100644 index 0000000000..cad9b50820 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h @@ -0,0 +1,1515 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_EXT_IMAGE_H +#define HSA_EXT_IMAGE_H + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_IMAGES +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +/** \defgroup ext-images Images and Samplers + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * Image format is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000, + /** + * Image size is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001, + /** + * Image pitch is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002, + /** + * Sampler descriptor is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003 +}; + +/** + * @brief Enumeration constants added to ::hsa_agent_info_t by this + * extension. + * + * @remark Additions to hsa_agent_info_t + */ +enum { + /** + * Maximum number of elements in 1D images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000, + /** + * Maximum number of elements in 1DA images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001, + /** + * Maximum number of elements in 1DB images. Must be at least 65536. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002, + /** + * Maximum dimensions (width, height) of 2D images, in image elements. The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003, + /** + * Maximum dimensions (width, height) of 2DA images, in image elements. The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004, + /** + * Maximum dimensions (width, height) of 2DDEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005, + /** + * Maximum dimensions (width, height) of 2DADEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006, + /** + * Maximum dimensions (width, height, depth) of 3D images, in image + * elements. The maximum along any dimension must be at least 2048. The type + * of this attribute is size_t[3]. + */ + HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007, + /** + * Maximum number of image layers in a image array. Must be at least 2048. The + * type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008, + /** + * Maximum number of read-only image handles that can be created for an agent at any one + * time. Must be at least 128. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009, + /** + * Maximum number of write-only and read-write image handles (combined) that + * can be created for an agent at any one time. Must be at least 64. The type of this + * attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A, + /** + * Maximum number of sampler handlers that can be created for an agent at any one + * time. Must be at least 16. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B, + /** + * Image pitch alignment. The agent only supports linear image data + * layouts with a row pitch that is a multiple of this value. Must be + * a power of 2. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C +}; + +/** + * @brief Image handle, populated by ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. Image + * handles are only unique within an agent, not across agents. + * + */ +typedef struct hsa_ext_image_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; + +} hsa_ext_image_t; + +/** + * @brief Geometry associated with the image. This specifies the + * number of image dimensions and whether the image is an image + * array. See the Image Geometry section in the HSA + * Programming Reference Manual for definitions on each + * geometry. The enumeration values match the BRIG type @p + * hsa_ext_brig_image_geometry_t. + */ +typedef enum { +/** + * One-dimensional image addressed by width coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1D = 0, + + /** + * Two-dimensional image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2D = 1, + + /** + * Three-dimensional image addressed by width, height, and depth coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_3D = 2, + + /** + * Array of one-dimensional images with the same size and format. 1D arrays + * are addressed by width and index coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1DA = 3, + + /** + * Array of two-dimensional images with the same size and format. 2D arrays + * are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DA = 4, + + /** + * One-dimensional image addressed by width coordinate. It has + * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An + * image with an opaque image data layout will always use a linear + * image data layout, and one with an explicit image data layout + * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR. + */ + HSA_EXT_IMAGE_GEOMETRY_1DB = 5, + + /** + * Two-dimensional depth image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6, + + /** + * Array of two-dimensional depth images with the same size and format. 2D + * arrays are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7 +} hsa_ext_image_geometry_t; + +/** + * @brief Channel type associated with the elements of an image. See + * the Channel Type section in the HSA Programming Reference + * Manual for definitions on each channel type. The + * enumeration values and definition match the BRIG type @p + * hsa_ext_brig_image_channel_type_t. + */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} hsa_ext_image_channel_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants. + */ +typedef uint32_t hsa_ext_image_channel_type32_t; + +/** + * + * @brief Channel order associated with the elements of an image. See + * the Channel Order section in the HSA Programming Reference + * Manual for definitions on each channel order. The + * enumeration values match the BRIG type @p + * hsa_ext_brig_image_channel_order_t. + */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0, + HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} hsa_ext_image_channel_order_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants. + */ +typedef uint32_t hsa_ext_image_channel_order32_t; + + +/** + * @brief Image format. + */ +typedef struct hsa_ext_image_format_s { + /** + * Channel type. + */ + hsa_ext_image_channel_type32_t channel_type; + + /** + * Channel order. + */ + hsa_ext_image_channel_order32_t channel_order; +} hsa_ext_image_format_t; + +/** + * @brief Implementation independent image descriptor. + */ +typedef struct hsa_ext_image_descriptor_s { + /** + * Image geometry. + */ + hsa_ext_image_geometry_t geometry; + /** + * Width of the image, in components. + */ + size_t width; + /** + * Height of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D, + * HSA_EXT_IMAGE_GEOMETRY_2DA, HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t height; + /** + * Depth of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0. + */ + size_t depth; + /** + * Number of image layers in the image array. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t array_size; + /** + * Image format. + */ + hsa_ext_image_format_t format; +} hsa_ext_image_descriptor_t; + +/** + * @brief Image capability. + */ +typedef enum { + /** + * Images of this geometry, format, and layout are not supported by + * the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0, + /** + * Read-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1, + /** + * Write-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2, + /** + * Read-write images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4, + /** + * @deprecated Images of this geometry, format, and layout can be accessed from + * read-modify-write atomic operations in the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8, + /** + * Images of this geometry, format, and layout are guaranteed to + * have a consistent data layout regardless of how they are + * accessed by the associated agent. + */ + HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10 +} hsa_ext_image_capability_t; + +/** + * @brief Image data layout. + * + * @details An image data layout denotes such aspects of image data + * layout as tiling and organization of channels in memory. Some image + * data layouts may only apply to specific image geometries, formats, + * and access permissions. Different agents may support different + * image layout identifiers, including vendor specific layouts. Note + * that an agent may not support the same image data layout for + * different access permissions to images with the same image + * geometry, size, and format. If multiple agents support the same + * image data layout then it is possible to use separate image handles + * for each agent that references the same image data. + */ + +typedef enum { + /** + * An implementation specific opaque image data layout which can + * vary depending on the agent, geometry, image format, image size, + * and access permissions. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0, + /** + * The image data layout is specified by the following rules in + * ascending byte address order. For a 3D image, 2DA image array, + * or 1DA image array, the image data is stored as a linear sequence + * of adjacent 2D image slices, 2D images, or 1D images + * respectively, spaced according to the slice pitch. Each 2D image + * is stored as a linear sequence of adjacent image rows, spaced + * according to the row pitch. Each 1D or 1DB image is stored as a + * single image row. Each image row is stored as a linear sequence + * of image elements. Each image element is stored as a linear + * sequence of image components specified by the left to right + * channel order definition. Each image component is stored using + * the memory type specified by the channel type. + * + * The 1DB image geometry always uses the linear image data layout. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1 +} hsa_ext_image_data_layout_t; + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, and image format for an image created with an opaque image + * data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. + * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, or @p capability_mask is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_get_capability( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, image format, and image layout for an image created with + * an explicit image data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. + * + * @param[in] image_data_layout The image data layout. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_get_capability instead. + * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p capability_mask is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + +/** + * @brief Agent specific image size and alignment requirements, populated by + * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout. + */ +typedef struct hsa_ext_image_data_info_s { + /** + * Image data size, in bytes. + */ + size_t size; + + /** + * Image data alignment, in bytes. Must always be a power of 2. + */ + size_t alignment; + +} hsa_ext_image_data_info_t; + +/** + * @brief Retrieve the image data requirements for a given combination of agent, image + * descriptor, and access permission for an image created with an opaque image + * data layout. + * + * @details The optimal image data size and alignment requirements may + * vary depending on the image attributes specified in @p + * image_descriptor, the @p access_permission, and the @p agent. Also, + * different implementations of the HSA runtime may return different + * requirements for the same input values. + * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors as long + * as ::hsa_ext_image_get_capability reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by @p agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. The @p agent must support the image format + * specified in @p image_descriptor for the given @p + * access_permission. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p + * agent does not support the image format specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * access_permission is not a valid access permission value, or @p + * image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Retrieve the image data requirements for a given combination of + * image descriptor, access permission, image data layout, image data row pitch, + * and image data slice pitch for an image created with an explicit image + * data layout. + * + * @details The image data size and alignment requirements may vary + * depending on the image attributes specified in @p image_descriptor, + * the @p access_permission, and the image layout. However, different + * implementations of the HSA runtime will return the same + * requirements for the same input values. + * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors and + * matching image layouts as long as ::hsa_ext_image_get_capability + * reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. Image layouts match if they are the + * same image data layout and use the same image row and slice pitch + * values. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by an agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. + * + * @param[in] image_data_layout The image data layout to use. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_data_get_info instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. + * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image + * format specified by @p image_descriptor is not supported for the + * @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image + * dimensions specified by @p image_descriptor are not supported for + * the @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and + * slice pitch specified by @p image_data_row_pitch and @p + * image_data_slice_pitch are invalid or not supported. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Creates an agent specific image handle to an image with an + * opaque image data layout. + * + * @details Images with an opaque image data layout created with + * different access permissions but matching image descriptors and + * same agent can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability for the image format specified in + * the image descriptor. Image descriptors match if they have the same + * values, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. + * + * If necessary, an application can use image operations (import, + * export, copy, clear) to prepare the image for the intended use + * regardless of the access permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by agent. The access permission defines how the agent + * is allowed to access the image using the image handle created and + * must match the corresponding HSAIL image handle type. The agent + * must support the image format specified in @p image_descriptor for + * the given @p access_permission. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent + * does not have the capability to support the image format contained + * in @p image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * support the creation of more image handles with the given @p access_permission). + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p access_permission is not a valid access permission + * value, or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + +/** + * @brief Creates an agent specific image handle to an image with an explicit + * image data layout. + * + * @details Images with an explicit image data layout created with + * different access permissions but matching image descriptors and + * matching image layout can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability_with_layout for the image format + * specified in the image descriptor and specified image data + * layout. Image descriptors match if they have the same values, with + * the exception that s-form channel orders match the corresponding + * non-s-form channel order and vice versa. Image layouts match if + * they are the same image data layout and use the same image row and + * slice values. + * + * If necessary, an application can use image operations (import, export, copy, + * clear) to prepare the image for the intended use regardless of the access + * permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by the agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. The agent must support the image format + * specified in @p image_descriptor for the given @p access_permission + * and @p image_data_layout. + * + * @param[in] image_data_layout The image data layout to use for the + * @p image_data. It is invalid to use + * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create + * instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. + * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does + * not have the capability to support the image format contained in the image + * descriptor using the specified @p access_permission and @p image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission and @p + * image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does + * not support the row and slice pitch specified by @p image_data_row_pitch + * and @p image_data_slice_pitch, or the values are invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * support the creation of more image handles with the given @p access_permission). + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + +/** + * @brief Destroy an image handle previously created using ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. + * + * @details Destroying the image handle does not free the associated image data, + * or modify its contents. The application should not destroy an image handle while + * there are references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_image_destroy( + hsa_agent_t agent, + hsa_ext_image_t image); + +/** + * @brief Copies a portion of one image (the source) to another image (the + * destination). + * + * @details The source and destination image formats should be the + * same, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. For example, + * it is allowed to copy a source image with a channel order of + * HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a + * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB. + * + * The source and destination images do not have to be of the same geometry and + * appropriate scaling is performed by the HSA runtime. It is possible to copy + * subregions between any combinations of source and destination geometries, provided + * that the dimensions of the subregions are the same. For example, it is + * allowed to copy a rectangular region from a 2D image to a slice of a 3D + * image. + * + * If the source and destination image data overlap, or the combination of + * offset and range references an out-out-bounds element in any of the images, + * the behavior is undefined. + * + * @param[in] agent Agent associated with both the source and destination image handles. + * + * @param[in] src_image Image handle of source image. The agent associated with the source + * image handle must be identical to that of the destination image. + * + * @param[in] src_offset Pointer to the offset within the source image where to + * copy the data from. Must not be NULL. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] dst_offset Pointer to the offset within the destination + * image where to copy the data. Must not be NULL. + * + * @param[in] range Dimensions of the image portion to be copied. The HSA + * runtime computes the size of the image data to be copied using this + * argument. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is + * NULL, @p dst_offset is NULL, or @p range is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_copy( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + +/** + * @brief Image region. + */ +typedef struct hsa_ext_image_region_s { + /** + * Offset within an image (in coordinates). + */ + hsa_dim3_t offset; + + /** + * Dimension size of the image range (in coordinates). The x, y, and z dimensions + * correspond to width, height, and depth or index respectively. + */ + hsa_dim3_t range; +} hsa_ext_image_region_t; + +/** + * @brief Import a linearly organized image data from memory directly to an + * image handle. + * + * @details This operation updates the image data referenced by the image handle + * from the source memory. The size of the data imported from memory is + * implicitly derived from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the source memory or destination image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the import operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_memory Source memory. Must not be NULL. + * + * @param[in] src_row_pitch The size in bytes of a single row of the image in the + * source memory. If the value is smaller than the destination image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image layer in an image array in the source memory. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p src_row_pitch * destination image region height, then the value used for + * @p src_row_pitch * destination image region height is used. + * Otherwise, the value is not used. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] image_region Pointer to the image region to be updated. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p + * image_region is NULL. + * + */ +hsa_status_t HSA_API hsa_ext_image_import( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Export the image data to linearly organized memory. + * + * @details The operation updates the destination memory with the image data of + * @p src_image. The size of the data exported to memory is implicitly derived + * from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the destination memory or source image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the export operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_image Image handle of source image. + * + * @param[in] dst_memory Destination memory. Must not be NULL. + * + * @param[in] dst_row_pitch The size in bytes of a single row of the image in the + * destination memory. If the value is smaller than the source image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image in an image array in the destination memory. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p dst_row_pitch * source image region height, then the value used for + * @p dst_row_pitch * source image region height is used. + * Otherwise, the value is not used. + * + * @param[in] image_region Pointer to the image region to be exported. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_export( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Clear a region of an image so that every image element has + * the specified value. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle for image to be cleared. + * + * @param[in] data The value to which to set each image element being + * cleared. It is specified as an array of image component values. The + * number of array elements must match the number of access components + * for the image channel order. The type of each array element must + * match the image access type of the image channel type. When the + * value is used to set the value of an image element, the conversion + * method corresponding to the image channel type is used. See the + * Channel Order section and Channel Type section in + * the HSA Programming Reference Manual for more + * information. Must not be NULL. + * + * @param[in] image_region Pointer to the image region to clear. Must not be + * NULL. If the region references an out-out-bounds element, the behavior is + * undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_clear( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Sampler handle. Samplers are populated by + * ::hsa_ext_sampler_create or ::hsa_ext_sampler_create_v2. Sampler handles are only unique + * within an agent, not across agents. + */ +typedef struct hsa_ext_sampler_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; +} hsa_ext_sampler_t; + +/** + * @brief Sampler address modes. The sampler address mode describes + * the processing of out-of-range image coordinates. See the + * Addressing Mode section in the HSA Programming Reference + * Manual for definitions on each address mode. The values + * match the BRIG type @p hsa_ext_brig_sampler_addressing_t. + */ +typedef enum { + /** + * Out-of-range coordinates are not handled. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0, + + /** + * Clamp out-of-range coordinates to the image edge. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1, + + /** + * Clamp out-of-range coordinates to the image border color. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2, + + /** + * Wrap out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3, + + /** + * Mirror out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles with every other + * tile a reflection. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4 + +} hsa_ext_sampler_addressing_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_addressing_mode32_t; + +/** + * @brief Sampler coordinate normalization modes. See the + * Coordinate Normalization Mode section in the HSA + * Programming Reference Manual for definitions on each + * coordinate normalization mode. The values match the BRIG type @p + * hsa_ext_brig_sampler_coord_normalization_t. + */ +typedef enum { + + /** + * Coordinates are used to directly address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0, + + /** + * Coordinates are scaled by the image dimension size before being + * used to address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1 + +} hsa_ext_sampler_coordinate_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_coordinate_mode32_t; + + +/** + * @brief Sampler filter modes. See the Filter Mode section + * in the HSA Programming Reference Manual for definitions + * on each address mode. The enumeration values match the BRIG type @p + * hsa_ext_brig_sampler_filter_t. + */ +typedef enum { + /** + * Filter to the image element nearest (in Manhattan distance) to the + * specified coordinate. + */ + HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + + /** + * Filter to the image element calculated by combining the elements in a 2x2 + * square block or 2x2x2 cube block around the specified coordinate. The + * elements are combined using linear interpolation. + */ + HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1 + +} hsa_ext_sampler_filter_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_filter_mode32_t; + +/** + * @brief Implementation independent sampler descriptor. + */ +typedef struct hsa_ext_sampler_descriptor_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_mode; +} hsa_ext_sampler_descriptor_t; + +/** + * @brief Implementation independent sampler descriptor v2 which supports + * different address modes in X, Y and Z axises. + */ +typedef struct hsa_ext_sampler_descriptor_v2_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_modes[3]; // in X, Y and Z axises +} hsa_ext_sampler_descriptor_v2_t; + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created. + * + * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor v2 and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created. + * + * @param[in] sampler_descriptor v2 Pointer to a sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create_v2( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create or + * ::hsa_ext_sampler_create_v2. + * + * @details The sampler handle should not be destroyed while there are + * references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the sampler handle. + * + * @param[in] sampler Sampler handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_sampler_destroy( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + +#define hsa_ext_images_1_00 + +/** + * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ext_images_1_00_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + +} hsa_ext_images_1_00_pfn_t; + +#define hsa_ext_images_1 + +/** + * @brief The function pointer table for the images v1 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ext_images_1_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + hsa_status_t (*hsa_ext_image_get_capability_with_layout)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_sampler_create_v2)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +} hsa_ext_images_1_pfn_t; +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h new file mode 100644 index 0000000000..a49221c49e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h @@ -0,0 +1,488 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ +#define OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ + +#include +#include "hsa.h" + +#define HSA_AQLPROFILE_VERSION_MAJOR 2 +#define HSA_AQLPROFILE_VERSION_MINOR 0 + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//////////////////////////////////////////////////////////////////////////////// +// Library version +uint32_t hsa_ven_amd_aqlprofile_version_major(); +uint32_t hsa_ven_amd_aqlprofile_version_minor(); + +/////////////////////////////////////////////////////////////////////// +// Library API: +// The library provides helper methods for instantiation of +// the profile context object and for populating of the start +// and stop AQL packets. The profile object contains a profiling +// events list and needed for profiling buffers descriptors, +// a command buffer and an output data buffer. To check if there +// was an error the library methods return a status code. Also +// the library provides methods for querying required buffers +// attributes, to validate the event attributes and to get profiling +// output data. +// +// Returned status: +// hsa_status_t – HSA status codes are used from hsa.h header +// +// Supported profiling features: +// +// Supported profiling events +typedef enum { + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0, + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1, +} hsa_ven_amd_aqlprofile_event_type_t; + +// Supported performance counters (PMC) blocks +// The block ID is the same for a block instances set, for example +// each block instance from the TCC block set, TCC0, TCC1, …, TCCN +// will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCKS_TCC. +typedef enum { + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14, + // Memory related blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24, + // System blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25, + // GFX10 added blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31, + + // UMC & MMEA System Blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33, + + HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER +} hsa_ven_amd_aqlprofile_block_name_t; + +// PMC event object structure +// ‘counter_id’ value is specified in GFXIPs perfcounter user guides +// which is the counters select value, “Performance Counters Selection” +// chapter. +typedef struct { + hsa_ven_amd_aqlprofile_block_name_t block_name; + uint32_t block_index; + uint32_t counter_id; +} hsa_ven_amd_aqlprofile_event_t; + +// Check if event is valid for the specific GPU +hsa_status_t hsa_ven_amd_aqlprofile_validate_event( + hsa_agent_t agent, // HSA handle for the profiling GPU + const hsa_ven_amd_aqlprofile_event_t* event, // [in] Pointer on validated event + bool* result); // [out] True if the event valid, False otherwise + +// Profiling parameters +// All parameters are generic and if not applicable for a specific +// profile configuration then error status will be returned. +typedef enum { + /** + * Select the target compute unit (wgp) for profiling. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0, + /** + * VMID Mask + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4, + /** + * Shader engine mask for selection. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7, + /** + * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8, + /** + * Set true for occupancy collection only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9, + /** + * ATT collection max data size, in MB. Shared among shader engines. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10, + /** + * Mask of which compute units to generate perfcounters. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240, + /** + * Select collection period for perfcounters. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241, + /** + * Select perfcounter ID (SQ block) for collection. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242, +} hsa_ven_amd_aqlprofile_parameter_name_t; + +// Profile parameter object +typedef struct { + hsa_ven_amd_aqlprofile_parameter_name_t parameter_name; + uint32_t value; +} hsa_ven_amd_aqlprofile_parameter_t; + +typedef enum { + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_0 = 0, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_1, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_3 +} hsa_ven_amd_aqlprofile_att_marker_channel_t; + +// +// Profile context object: +// The library provides a profile object structure which contains +// the events array, a buffer for the profiling start/stop commands +// and a buffer for the output data. +// The buffers are specified by the buffer descriptors and allocated +// by the application. The buffers allocation attributes, the command +// buffer size, the PMC output buffer size as well as profiling output +// data can be get using the generic get profile info helper _get_info. +// +// Buffer descriptor +typedef struct { + void* ptr; + uint32_t size; +} hsa_ven_amd_aqlprofile_descriptor_t; + +// Profile context object structure, contains profiling events list and +// needed for profiling buffers descriptors, a command buffer and +// an output data buffer +typedef struct { + hsa_agent_t agent; // GFXIP handle + hsa_ven_amd_aqlprofile_event_type_t type; // Events type + const hsa_ven_amd_aqlprofile_event_t* events; // Events array + uint32_t event_count; // Events count + const hsa_ven_amd_aqlprofile_parameter_t* parameters; // Parameters array + uint32_t parameter_count; // Parameters count + hsa_ven_amd_aqlprofile_descriptor_t output_buffer; // Output buffer + hsa_ven_amd_aqlprofile_descriptor_t command_buffer; // PM4 commands +} hsa_ven_amd_aqlprofile_profile_t; + +// +// AQL packets populating methods: +// The helper methods to populate provided by the application START and +// STOP AQL packets which the application is required to submit before and +// after profiled GPU task packets respectively. +// +// AQL Vendor Specific packet which carries a PM4 command +typedef struct { + uint16_t header; + uint16_t pm4_command[27]; + hsa_signal_t completion_signal; +} hsa_ext_amd_aql_pm4_packet_t; + +// Method to populate the provided AQL packet with profiling start commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_start( + hsa_ven_amd_aqlprofile_profile_t* profile, // [in,out] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet + +// Method to populate the provided AQL packet with profiling stop commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_stop( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet + +// Method to populate the provided AQL packet with profiling read commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_read( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); // [out] profile stop AQL packet + +// Legacy devices, PM4 profiling packet size +const unsigned HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192; +// Legacy devices, converting the profiling AQL packet to PM4 packet blob +hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, // [in] AQL packet + void* data); // [out] PM4 packet blob + +// Method to add a marker (correlation ID) into the ATT buffer. +hsa_status_t hsa_ven_amd_aqlprofile_att_marker( + hsa_ven_amd_aqlprofile_profile_t* profile, // [in,out] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_marker_packet, // [out] profile marker AQL packet + uint32_t data, // [in] Data to be inserted + hsa_ven_amd_aqlprofile_att_marker_channel_t channel); // [in] Comm channel + +// +// Get profile info: +// Generic method for getting various profile info including profile buffers +// attributes like the command buffer size and the profiling PMC results. +// It’s implied that all counters are 64bit values. +// +// Profile generic output data: +typedef struct { + uint32_t sample_id; // PMC sample or trace buffer index + union { + struct { + hsa_ven_amd_aqlprofile_event_t event; // PMC event + uint64_t result; // PMC result + } pmc_data; + hsa_ven_amd_aqlprofile_descriptor_t trace_data; // Trace output data descriptor + }; +} hsa_ven_amd_aqlprofile_info_data_t; + +// ID query type +typedef struct { + const char* name; + uint32_t id; + uint32_t instance_count; +} hsa_ven_amd_aqlprofile_id_query_t; + +// Profile attributes +typedef enum { + HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value + HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value + HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value + // in info_data object + HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3, // get_info returns trace buffer ptr/size + // in info_data object + HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counter + HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances + // by name string using _id_query_t + HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for + // counters enable command buffer + HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for + // counters disable command buffer +} hsa_ven_amd_aqlprofile_info_type_t; + + +// Definition of output data iterator callback +typedef hsa_status_t (*hsa_ven_amd_aqlprofile_data_callback_t)( + hsa_ven_amd_aqlprofile_info_type_t info_type, // [in] data type, PMC or trace data + hsa_ven_amd_aqlprofile_info_data_t* info_data, // [in] info_data object + void* callback_data); // [in,out] data passed to the callback + +// Method for getting the profile info +hsa_status_t hsa_ven_amd_aqlprofile_get_info( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ven_amd_aqlprofile_info_type_t attribute, // [in] requested profile attribute + void* value); // [in,out] returned value + +// Method for iterating the events output data +hsa_status_t hsa_ven_amd_aqlprofile_iterate_data( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate the output data + void* data); // [in,out] data passed to the callback + +// Return error string +hsa_status_t hsa_ven_amd_aqlprofile_error_string( + const char** str); // [out] pointer on the error string + +/** + * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. + */ +typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name); +/** + * @brief Iterate over all possible event coordinate IDs and their names. + */ +hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param position A counting sequence indicating callback number. + * @param id Coordinate ID as in _iterate_event_ids. + * @param extent Coordinate extent indicating maximum allowed instances. + * @param coordinate The coordinate, in the range [0,extent-1]. + * @param name Coordinate name as in _iterate_event_ids. + * @param userdata Userdata returned from _iterate_event_coord function. + */ +typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( + int position, + int id, + int extent, + int coordinate, + const char* name, + void* userdata +); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param[in] agent HSA agent. + * @param[in] event The event ID and block ID to iterate for. + * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. + * @param[in] callback Callback function to return the coordinates. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + */ +hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord( + hsa_agent_t agent, + hsa_ven_amd_aqlprofile_event_t event, + uint32_t sample_id, + hsa_ven_amd_aqlprofile_coordinate_callback_t callback, + void* userdata +); + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_aqlprofile_VERSION_MAJOR 1 +#define hsa_ven_amd_aqlprofile_LIB(suff) "libhsa-amd-aqlprofile" suff ".so" + +#ifdef HSA_LARGE_MODEL +static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("64"); +#else +static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB(""); +#endif + +/** + * @brief Extension function table. + */ +typedef struct hsa_ven_amd_aqlprofile_1_00_pfn_s { + uint32_t (*hsa_ven_amd_aqlprofile_version_major)(); + uint32_t (*hsa_ven_amd_aqlprofile_version_minor)(); + + hsa_status_t (*hsa_ven_amd_aqlprofile_error_string)( + const char** str); + + hsa_status_t (*hsa_ven_amd_aqlprofile_validate_event)( + hsa_agent_t agent, + const hsa_ven_amd_aqlprofile_event_t* event, + bool* result); + + hsa_status_t (*hsa_ven_amd_aqlprofile_start)( + hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_stop)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_read)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_legacy_get_pm4)( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, + void* data); + + hsa_status_t (*hsa_ven_amd_aqlprofile_get_info)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ven_amd_aqlprofile_info_type_t attribute, + void* value); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_data)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ven_amd_aqlprofile_data_callback_t callback, + void* data); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_ids)( + hsa_ven_amd_aqlprofile_eventname_callback_t + ); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_coord)( + hsa_agent_t agent, + hsa_ven_amd_aqlprofile_event_t event, + uint32_t sample_id, + hsa_ven_amd_aqlprofile_coordinate_callback_t callback, + void* userdata + ); + + hsa_status_t (*hsa_ven_amd_aqlprofile_att_marker)( + hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_packet, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel + ); +} hsa_ven_amd_aqlprofile_1_00_pfn_t; + +typedef hsa_ven_amd_aqlprofile_1_00_pfn_t hsa_ven_amd_aqlprofile_pfn_t; + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h new file mode 100644 index 0000000000..47236c86e9 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h @@ -0,0 +1,667 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension for additional loader functionality. + +#ifndef HSA_VEN_AMD_LOADER_H +#define HSA_VEN_AMD_LOADER_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @brief Queries equivalent host address for given @p device_address, and + * records it in @p host_address. + * + * + * @details Contents of memory pointed to by @p host_address would be identical + * to contents of memory pointed to by @p device_address. Only difference + * between the two is host accessibility: @p host_address is always accessible + * from host, @p device_address might not be accessible from host. + * + * If @p device_address already points to host accessible memory, then the value + * of @p device_address is simply copied into @p host_address. + * + * The lifetime of @p host_address is the same as the lifetime of @p + * device_address, and both lifetimes are limited by the lifetime of the + * executable that is managing these addresses. + * + * + * @param[in] device_address Device address to query equivalent host address + * for. + * + * @param[out] host_address Pointer to application-allocated buffer to record + * queried equivalent host address in. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or + * null, or @p host_address is null. + */ +hsa_status_t hsa_ven_amd_loader_query_host_address( + const void *device_address, + const void **host_address); + +/** + * @brief The storage type of the code object that is backing loaded memory + * segment. + */ +typedef enum { + /** + * Loaded memory segment is not backed by any code object (anonymous), as the + * case would be with BSS (uninitialized data). + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0, + /** + * Loaded memory segment is backed by the code object that is stored in the + * file. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1, + /** + * Loaded memory segment is backed by the code object that is stored in the + * memory. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2 +} hsa_ven_amd_loader_code_object_storage_type_t; + +/** + * @brief Loaded memory segment descriptor. + * + * + * @details Loaded memory segment descriptor describes underlying loaded memory + * segment. Loaded memory segment is created/allocated by the executable during + * the loading of the code object that is backing underlying memory segment. + * + * The lifetime of underlying memory segment is limited by the lifetime of the + * executable that is managing underlying memory segment. + */ +typedef struct hsa_ven_amd_loader_segment_descriptor_s { + /** + * Agent underlying memory segment is allocated on. If the code object that is + * backing underlying memory segment is program code object, then 0. + */ + hsa_agent_t agent; + /** + * Executable that is managing this underlying memory segment. + */ + hsa_executable_t executable; + /** + * Storage type of the code object that is backing underlying memory segment. + */ + hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated + * filepath to the code object; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host + * accessible pointer to the first byte of the code object. + */ + const void *code_object_storage_base; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of + * the filepath to the code object (including null-terminating character); + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in + * bytes, of the memory occupied by the code object. + */ + size_t code_object_storage_size; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - other, then offset, in bytes, from the beginning of the code object to + * the first byte in the code object data is copied from. + */ + size_t code_object_storage_offset; + /** + * Starting address of the underlying memory segment. + */ + const void *segment_base; + /** + * Size, in bytes, of the underlying memory segment. + */ + size_t segment_size; +} hsa_ven_amd_loader_segment_descriptor_t; + +/** + * @brief Either queries loaded memory segment descriptors, or total number of + * loaded memory segment descriptors. + * + * + * @details If @p segment_descriptors is not null and @p num_segment_descriptors + * points to number that exactly matches total number of loaded memory segment + * descriptors, then queries loaded memory segment descriptors, and records them + * in @p segment_descriptors. If @p segment_descriptors is null and @p + * num_segment_descriptors points to zero, then queries total number of loaded + * memory segment descriptors, and records it in @p num_segment_descriptors. In + * all other cases returns appropriate error code (see below). + * + * The caller of this function is responsible for the allocation/deallocation + * and the lifetime of @p segment_descriptors and @p num_segment_descriptors. + * + * The lifetime of loaded memory segments that are described by queried loaded + * memory segment descriptors is limited by the lifetime of the executable that + * is managing loaded memory segments. + * + * Queried loaded memory segment descriptors are always self-consistent: they + * describe a complete set of loaded memory segments that are being backed by + * fully loaded code objects that are present at the time (i.e. this function + * is blocked until all executable manipulations are fully complete). + * + * + * @param[out] segment_descriptors Pointer to application-allocated buffer to + * record queried loaded memory segment descriptors in. Can be null if @p + * num_segment_descriptors points to zero. + * + * @param[in,out] num_segment_descriptors Pointer to application-allocated + * buffer that contains either total number of loaded memory segment descriptors + * or zero. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null + * while @p num_segment_descriptors points to non-zero number, @p + * segment_descriptors is not null while @p num_segment_descriptors points to + * zero, or @p num_segment_descriptors is null. + * + * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors + * does not point to number that exactly matches total number of loaded memory + * segment descriptors. + */ +hsa_status_t hsa_ven_amd_loader_query_segment_descriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + +/** + * @brief Obtains the handle of executable to which the device address belongs. + * + * @details This method should not be used to obtain executable handle by using + * a host address. The executable returned is expected to be alive until its + * destroyed by the user. + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there + * is no exectuable found for this kernel code object. + */ +hsa_status_t hsa_ven_amd_loader_query_executable( + const void *device_address, + hsa_executable_t *executable); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the loaded code objects in an executable, and invoke + * an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per loaded code object. The + * HSA runtime passes three arguments to the callback: the executable, a + * loaded code object, and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and + * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that + * status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + +/** + * @brief Loaded code object kind. + */ +typedef enum { + /** + * Program code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1, + /** + * Agent code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2 +} hsa_ven_amd_loader_loaded_code_object_kind_t; + +/** + * @brief Loaded code object attributes. + */ +typedef enum hsa_ven_amd_loader_loaded_code_object_info_e { + /** + * The executable in which this loaded code object is loaded. The + * type of this attribute is ::hsa_executable_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1, + /** + * The kind of this loaded code object. The type of this attribute is + * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2, + /** + * The agent on which this loaded code object is loaded. The + * value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this + * attribute is ::hsa_agent_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3, + /** + * The storage type of the code object reader used to load the loaded code object. + * The type of this attribute is ::uint32_t interpreted as a + * ::hsa_ven_amd_loader_code_object_storage_type_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4, + /** + * The memory address of the first byte of the code object that was loaaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5, + /** + * The memory size in bytes of the code object that was loaaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6, + /** + * The file descriptor of the code object that was loaaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this + * attribute is ::int. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7, + /** + * The signed byte address difference of the memory address at which the code + * object is loaded minus the virtual address specified in the code object + * that is loaded. The value of this attribute is only defined if the + * executable in which the code object is loaded is froozen. The type of this + * attribute is ::int64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8, + /** + * The base memory address at which the code object is loaded. This is the + * base address of the allocation for the lowest addressed segment of the code + * object that is loaded. Note that any non-loaded segments before the first + * loaded segment are ignored. The value of this attribute is only defined if + * the executable in which the code object is loaded is froozen. The type of + * this attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9, + /** + * The byte size of the loaded code objects contiguous memory allocation. The + * value of this attribute is only defined if the executable in which the code + * object is loaded is froozen. The type of this attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10, + /** + * The length of the URI in bytes, not including the NUL terminator. The type + * of this attribute is uint32_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11, + /** + * The URI name from which the code object was loaded. The type of this + * attribute is a NUL terminated \p char* with the length equal to the value + * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute. + * The URI name syntax is defined by the following BNF syntax: + * + * code_object_uri ::== file_uri | memory_uri + * file_uri ::== "file://" file_path [ range_specifier ] + * memory_uri ::== "memory://" process_id range_specifier + * range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number + * file_path ::== URI_ENCODED_OS_FILE_PATH + * process_id ::== DECIMAL_NUMBER + * number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER + * + * ``number`` is a C integral literal where hexadecimal values are prefixed by + * "0x" or "0X", and octal values by "0". + * + * ``file_path`` is the file's path specified as a URI encoded UTF-8 string. + * In URI encoding, every character that is not in the regular expression + * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexidecimal digits + * proceeded by "%". Directories in the path are separated by "/". + * + * ``offset`` is a 0-based byte offset to the start of the code object. For a + * file URI, it is from the start of the file specified by the ``file_path``, + * and if omitted defaults to 0. For a memory URI, it is the memory address + * and is required. + * + * ``size`` is the number of bytes in the code object. For a file URI, if + * omitted it defaults to the size of the file. It is required for a memory + * URI. + * + * ``process_id`` is the identity of the process owning the memory. For Linux + * it is the C unsigned integral decimal literal for the process ID (PID). + * + * For example: + * + * file:///dir1/dir2/file1 + * file:///dir3/dir4/file2#offset=0x2000&size=3000 + * memory://1234#offset=0x20000&size=3000 + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12, +} hsa_ven_amd_loader_loaded_code_object_info_t; + +/** + * @brief Get the current value of an attribute for a given loaded code + * object. + * + * @param[in] loaded_code_object Loaded code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * loaded code object attribute, or @p value is NULL. + */ +hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + +//===----------------------------------------------------------------------===// + +/** + * @brief Create a code object reader to operate on a file with size and offset. + * + * @param[in] file File descriptor. The file must have been opened by + * application with at least read permissions prior calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[in] size Size of the code object embedded in @p file. + * + * @param[in] offset 0-based offset relative to the beginning of the @p file + * that denotes the beginning of the code object embedded within the @p file. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least + * read permissions. This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset + * do not form a valid code object. If file size is 0. Or offset > file size. + * This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t +hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the available executables, and invoke an + * application-defined callback on every iteration. While + * ::hsa_ven_amd_loader_iterate_executables is executing any calls to + * ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy will be blocked. + * + * @param[in] callback Callback to be invoked once per executable. The HSA + * runtime passes two arguments to the callback: the executable and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_ven_amd_loader_iterate_executables returns that status value. If + * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy then the behavior is undefined. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +//===----------------------------------------------------------------------===// + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_loader 001003 + +/** + * @brief Extension function table version 1.00. + */ +typedef struct hsa_ven_amd_loader_1_00_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); +} hsa_ven_amd_loader_1_00_pfn_t; + +/** + * @brief Extension function table version 1.01. + */ +typedef struct hsa_ven_amd_loader_1_01_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); +} hsa_ven_amd_loader_1_01_pfn_t; + +/** + * @brief Extension function table version 1.02. + */ +typedef struct hsa_ven_amd_loader_1_02_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); +} hsa_ven_amd_loader_1_02_pfn_t; + +/** + * @brief Extension function table version 1.03. + */ +typedef struct hsa_ven_amd_loader_1_03_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + + hsa_status_t + (*hsa_ven_amd_loader_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); +} hsa_ven_amd_loader_1_03_pfn_t; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* HSA_VEN_AMD_LOADER_H */ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h new file mode 100644 index 0000000000..019f0ea5c9 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h @@ -0,0 +1,416 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_H +#define HSA_VEN_AMD_PC_SAMPLING_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + + +/** + * @brief HSA AMD Vendor PC Sampling APIs + * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be + * modified extensively in the future + */ + +/** + * @brief PC Sampling sample data for hosttrap sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t reserved0; + uint64_t reserved1; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_hosttrap_v1_t; + +/** + * @brief PC Sampling sample data for stochastic sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_snapshot_v1_t; + +/** + * @brief PC Sampling method kinds + */ +typedef enum { + HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1, + HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 +} hsa_ven_amd_pcs_method_kind_t; + +/** + * @brief PC Sampling interval unit type + */ +typedef enum { + HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS +} hsa_ven_amd_pcs_units_t; + +/** + * @brief HSA callback function to perform the copy onto a destination buffer + * + * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal + * buffers. Remaining contents of HSA internal buffers will be included in next + * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling + * hsa_ven_amd_pcs_flush. + * + * @param[in] hsa_callback_data private data to pass back to HSA. Provided in + * hsa_ven_amd_pcs_data_ready_callback_t + * + * @param[in] data_size size of destination buffer in bytes. + * @param[in] destination destination buffer + * @retval TBD: but could be used to indicate that there is no more data to be read. + * Or indicate an error and abort of current copy operations + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data, + size_t data_size, void* destination); + +/** + * @brief HSA callback function to to indicate that there is data ready to be copied + * + * When the client receives this callback, the client should call back @p data_copy_callback for HSA + * to perform the copy operation into an available buffer. @p data_copy_callback can be called back + * multiple times with smaller @p data_size to split the copy operation. + * + * This callback must not call ::hsa_ven_amd_pcs_flush. + * + * @param[in] client_callback_data client private data passed in via + * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id + * @param[in] data_size size of data available to be copied + * @param[in] lost_sample_count number of lost samples since last call to + * hsa_ven_amd_pcs_data_ready_callback_t. + * @param[in] data_copy_callback callback function for HSA to perform the actual copy + * @param[in] hsa_callback_data private data to pass back to HSA + */ +typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)( + void* client_callback_data, size_t data_size, size_t lost_sample_count, + hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data); + +/** + * @brief Opaque handle representing a sampling session. + * Two sessions having same handle value represent the same session + */ +typedef struct { + uint64_t handle; +} hsa_ven_amd_pcs_t; + +/** + * @brief PC Sampling configuration flag options + */ +typedef enum { + /* The interval for this sampling method have to be a power of 2 */ + HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0) +} hsa_ven_amd_pcs_configuration_flags_t; + +/** + * @brief PC Sampling method information + * Used to provide client with list of supported PC Sampling methods + */ +typedef struct { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t min_interval; + size_t max_interval; + uint64_t flags; +} hsa_ven_amd_pcs_configuration_t; + +/** + * @brief Callback function to iterate through list of supported PC Sampling configurations + * + * @param[in] configuration one entry for supported PC Sampling method and configuration options + * @param[in] callback_data client private callback data that was passed in when calling + * hsa_ven_amd_pcs_iterate_configuration + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)( + const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data); + +/** + * @brief Iterate through list of current supported PC Sampling configurations for this @p agent + * + * HSA will callback @p configuration_callback for each currently available PC Sampling + * configuration. The list of currently available configurations may not be the complete list of + * configurations supported on the @p agent. The list of currently available configurations may be + * reduced if the @p agent is currently handling other PC sampling sessions. + * + * @param[in] agent target agent + * @param[in] configuration_callback callback function to iterate through list of configurations + * @param[in] callback_data client private callback data + **/ +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +/** + * @brief Create a PC Sampling session on @p agent + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, as described by the + * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of + * hsa_ven_amd_pcs_iterate_configuration for this @p agent. + * A successfull call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + + +/** + * @brief Creates a PC Sampling session on @p agent. Assumes that the caller provides the + * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling + * on the @p agent. + * + * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing + * PC sampling session that was previously created in the underlying driver. + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, and match the parameters that we used to create + * the underlying PC Sampling session in the underlying driver. + * A successfull call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +/** + * @brief Free a PC Sampling session on @p agent + * + * Free all the resources allocated for a PC Sampling session on @p agent + * Internal buffers for this session will be lost. + * If the session was active, the session will be stopped before it is destroyed. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Start a PC Sampling session + * + * Activate a PC Sampling session that was previous created. + * The session with be in a active state after this call + * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session started successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Stop a PC Sampling session + * + * Stop a session that is currently active + * After a session is stopped HSA may still have some PC Sampling data in its internal buffers. + * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal + * buffers are not drained and the session is started again, the internal buffers will be available + * on the next data_ready_callback. + * If the session was already inactive, this will result in a no-op and will return + * HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session stopped successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Flush internal buffers for a PC Sampling session + * + * Drain internal buffers for a PC Sampling session. If internal buffers have available data, + * this trigger a data_ready_callback. + * + * The function blocks until all PC samples associated with the @p pc_sampling session + * generated prior to the function call have been communicated by invocations of + * @p data_ready_callback having completed execution. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session flushed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +#define hsa_ven_amd_pc_sampling_1_00 + +/** + * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by + * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t { + hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_create_from_id)( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling); + +} hsa_ven_amd_pc_sampling_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif /* HSA_VEN_AMD_PC_SAMPLING_H */ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h b/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h new file mode 100644 index 0000000000..44b7fb00aa --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h @@ -0,0 +1,1090 @@ +#ifndef _WSL_INC_PM4_CMDS_H_ +#define _WSL_INC_PM4_CMDS_H_ + +#include + +#define mmCOMPUTE_NUM_THREAD_X 0x2E07 +#define mmCOMPUTE_PGM_LO 0x2E0C +#define mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO 0x2E10 +#define mmCOMPUTE_PGM_RSRC1 0x2E12 +#define mmCOMPUTE_PGM_RSRC3 0x2E28 +#define mmCOMPUTE_RESOURCE_LIMITS 0x2E15 +#define mmCOMPUTE_USER_DATA_0 0x2E40 + +#define PM4_TYPE_SHIFT 30 +#define PM4_COUNT_SHIFT 16 +#define PM4_OPCODE_SHIFT 8 +#define PM4_SHADER_TYPE_SHIFT 1 + +#define PM4_GFX_SHADER 0 +#define PM4_COMPUTE_SHADER 1 + +#define PM4_TYPE3_HDR(_opc_, _count_) \ + (uint32_t)((3) << PM4_TYPE_SHIFT | \ + ((_count_) - 2) << PM4_COUNT_SHIFT | \ + (_opc_) << PM4_OPCODE_SHIFT) | \ + (PM4_COMPUTE_SHADER << PM4_SHADER_TYPE_SHIFT) + +union PM4_MEC_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; ///< reserved + uint32_t opcode : 8; ///< IT opcode + uint32_t count : 14;///< number of DWORDs - 1 in the information body. + uint32_t type : 2; ///< packet identifier. It should be 3 for type 3 packets + }; + uint32_t u32All; +}; + +#define IT_DISPATCH_DIRECT 0x15 +#define IT_ATOMIC_MEM 0x1E +#define IT_WRITE_DATA 0x37 +#define IT_INDIRECT_BUFFER 0x3F +#define IT_COPY_DATA 0x40 +#define IT_EVENT_WRITE 0x46 +#define IT_RELEASE_MEM 0x49 +#define IT_ACQUIRE_MEM 0x58 +#define IT_SET_SH_REG 0x76 + +struct PM4_MEC_SET_SH_REG { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reg_offset:16; + uint32_t reserved1:16; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_DISPATCH_DIRECT { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t dim_x; + uint32_t dim_y; + uint32_t dim_z; + uint32_t dispatch_initiator; +}; + +// ------------------------------- MEC_EVENT_WRITE_event_index_enum ------------------------------- +enum MEC_EVENT_WRITE_event_index_enum { + event_index__mec_event_write__other = 0, + event_index__mec_event_write__sample_pipelinestat = 2, + event_index__mec_event_write__cs_partial_flush = 4, + event_index__mec_event_write__sample_streamoutstats__GFX11 = 8, + event_index__mec_event_write__sample_streamoutstats1__GFX11 = 9, + event_index__mec_event_write__sample_streamoutstats2__GFX11 = 10, + event_index__mec_event_write__sample_streamoutstats3__GFX11 = 11, +}; + +enum VGT_EVENT_TYPE { + Reserved_0x00 = 0x00000000, + SAMPLE_STREAMOUTSTATS1 = 0x00000001, + SAMPLE_STREAMOUTSTATS2 = 0x00000002, + SAMPLE_STREAMOUTSTATS3 = 0x00000003, + CACHE_FLUSH_TS = 0x00000004, + CONTEXT_DONE = 0x00000005, + CACHE_FLUSH = 0x00000006, + CS_PARTIAL_FLUSH = 0x00000007, + VGT_STREAMOUT_SYNC = 0x00000008, + VGT_STREAMOUT_RESET = 0x0000000a, + END_OF_PIPE_INCR_DE = 0x0000000b, + END_OF_PIPE_IB_END = 0x0000000c, + RST_PIX_CNT = 0x0000000d, + BREAK_BATCH = 0x0000000e, + VS_PARTIAL_FLUSH = 0x0000000f, + PS_PARTIAL_FLUSH = 0x00000010, + FLUSH_HS_OUTPUT = 0x00000011, + FLUSH_DFSM = 0x00000012, + RESET_TO_LOWEST_VGT = 0x00000013, + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014, + CACHE_FLUSH_AND_INV_EVENT = 0x00000016, + PERFCOUNTER_START = 0x00000017, + PERFCOUNTER_STOP = 0x00000018, + PIPELINESTAT_START = 0x00000019, + PIPELINESTAT_STOP = 0x0000001a, + PERFCOUNTER_SAMPLE = 0x0000001b, + SAMPLE_PIPELINESTAT = 0x0000001e, + SO_VGTSTREAMOUT_FLUSH = 0x0000001f, + SAMPLE_STREAMOUTSTATS = 0x00000020, + RESET_VTX_CNT = 0x00000021, + BLOCK_CONTEXT_DONE = 0x00000022, + CS_CONTEXT_DONE = 0x00000023, + VGT_FLUSH = 0x00000024, + TGID_ROLLOVER = 0x00000025, + SQ_NON_EVENT = 0x00000026, + SC_SEND_DB_VPZ = 0x00000027, + BOTTOM_OF_PIPE_TS = 0x00000028, + FLUSH_SX_TS = 0x00000029, + DB_CACHE_FLUSH_AND_INV = 0x0000002a, + FLUSH_AND_INV_DB_DATA_TS = 0x0000002b, + FLUSH_AND_INV_DB_META = 0x0000002c, + FLUSH_AND_INV_CB_DATA_TS = 0x0000002d, + FLUSH_AND_INV_CB_META = 0x0000002e, + CS_DONE = 0x0000002f, + PS_DONE = 0x00000030, + FLUSH_AND_INV_CB_PIXEL_DATA = 0x00000031, + SX_CB_RAT_ACK_REQUEST = 0x00000032, + THREAD_TRACE_START = 0x00000033, + THREAD_TRACE_STOP = 0x00000034, + THREAD_TRACE_MARKER = 0x00000035, + THREAD_TRACE_FINISH = 0x00000037, + PIXEL_PIPE_STAT_CONTROL = 0x00000038, + PIXEL_PIPE_STAT_DUMP = 0x00000039, + PIXEL_PIPE_STAT_RESET = 0x0000003a, + CONTEXT_SUSPEND = 0x0000003b, + OFFCHIP_HS_DEALLOC = 0x0000003c, + ENABLE_NGG_PIPELINE = 0x0000003d, + SET_FE_ID__GFX09 = 0x00000009, + Available_0x1c__GFX09 = 0x0000001c, + Available_0x1d__GFX09 = 0x0000001d, + THREAD_TRACE_FLUSH__GFX09 = 0x00000036, + Reserved_0x3f__GFX09 = 0x0000003f, + ZPASS_DONE__GFX09_10 = 0x00000015, + ENABLE_LEGACY_PIPELINE__GFX09_10 = 0x0000003e, + Reserved_0x09__GFX10PLUS = 0x00000009, + FLUSH_ES_OUTPUT__GFX10PLUS = 0x0000001c, + BIN_CONF_OVERRIDE_CHECK__GFX10PLUS = 0x0000001d, + THREAD_TRACE_DRAW__GFX10PLUS = 0x00000036, + DRAW_DONE__GFX10PLUS = 0x0000003f, + WAIT_SYNC__GFX11 = 0x00000015, + ENABLE_PIPELINE_NOT_USED__GFX11 = 0x0000003e, +}; + +struct PM4_MEC_EVENT_WRITE { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t reserved2:19; + uint32_t offload_enable:1; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_ATOMIC_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t atomic:7; + uint32_t reserved1:1; + uint32_t command:4; + uint32_t reserved2:13; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t src_data_lo; + uint32_t src_data_hi; + uint32_t cmp_data_lo; + uint32_t cmp_data_hi; + union { + struct { + uint32_t loop_interval:13; + uint32_t reserved4:19; + } bitfields9; + uint32_t ordinal9; + }; +}; + +struct PM4_MEC_WRITE_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reserved1:8; + uint32_t dst_sel:4; + uint32_t reserved2:4; + uint32_t addr_incr:1; + uint32_t reserved3:2; + uint32_t resume_vf:1; + uint32_t wr_confirm:1; + uint32_t reserved4:4; + uint32_t cache_policy:2; + uint32_t reserved5:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t dst_mmreg_addr:18; + uint32_t reserved6:14; + } bitfields3a; + struct { + uint32_t dst_gds_addr:16; + uint32_t reserved7:16; + } bitfields3b; + struct { + uint32_t reserved8:2; + uint32_t dst_mem_addr_lo:30; + } bitfields3c; + uint32_t ordinal3; + }; + uint32_t dst_mem_addr_hi; + uint64_t write_data_value; +}; + +#define PERSISTENT_SPACE_START 0x00002c00 + +template +void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, + sizeof(T) / sizeof(uint32_t)); + pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START; +} + +template +void GenerateCmdHeader(T* pm4, int op_code) { + pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t)); +} + +/// @brief Defines the Gpu command to dispatch a kernel. It embeds +/// various Gpu hardware specific data structures for initialization +/// and configuration before a dispatch begins to run +struct DispatchTemplate { + + /// @brief Structure used to initialize the group dimensions + /// of a kernel dispatch and if performance counters are enabled + struct DispatchDimensionRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_num_thread_x; + uint32_t compute_num_thread_y; + uint32_t compute_num_thread_z; + } dimension_regs; + + struct DispatchProgramRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_lo; + uint32_t compute_pgm_hi; + } program_regs; + + struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc1; + uint32_t compute_pgm_rsrc2; + } program_resource_regs; + + /// @brief Structure used to initialize parameters related to + /// thread management i.e. number of waves to issue and number + /// of Compute Units to use + struct DispatchResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_resource_limits; + uint32_t compute_static_thread_mgmt_se0; + uint32_t compute_static_thread_mgmt_se1; + uint32_t compute_tmpring_size; + uint32_t compute_static_thread_mgmt_se2; + uint32_t compute_static_thread_mgmt_se3; + } resource_regs; + + /// @brief Structure used to pass handles of the Aql dispatch + /// packet, Aql queue, Kernel argument address block, Scratch + /// buffer + struct DispatchComputeUserDataRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_user_data[16]; + } compute_user_data_regs; + + /// @brief Structure used to configure Cache flush policy + /// and dimensions of total work size + PM4_MEC_DISPATCH_DIRECT dispatch_direct; +}; + +struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc3; +}; + + +/// @brief Structure used to issue a programing scratch command for gfx11+ +struct SetScratchTemplate { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t scratch_lo; + uint32_t scratch_hi; +}; + +/// @brief Structure used to issue a Gpu Barrier command +struct BarrierTemplate { + PM4_MEC_EVENT_WRITE event_write; +}; + +//--------------------MEC_ATOMIC_MEM-------------------- +enum MEC_ATOMIC_MEM_command_enum { + command__mec_atomic_mem__single_pass_atomic = 0, + command__mec_atomic_mem__loop_until_compare_satisfied = 1, + command__mec_atomic_mem__wait_for_write_confirmation = 2, + command__mec_atomic_mem__send_and_continue = 3, +}; + +enum MEC_ATOMIC_MEM_cache_policy_enum { + cache_policy__mec_atomic_mem__lru = 0, + cache_policy__mec_atomic_mem__stream = 1, + cache_policy__mec_atomic_mem__noa = 2, + cache_policy__mec_atomic_mem__bypass = 3, +}; + +enum TC_OP { + TC_OP_READ = 0x00000000, + TC_OP_ATOMIC_FCMPSWAP_RTN_32 = 0x00000001, + TC_OP_ATOMIC_FMIN_RTN_32 = 0x00000002, + TC_OP_ATOMIC_FMAX_RTN_32 = 0x00000003, + TC_OP_RESERVED_FOP_RTN_32_0 = 0x00000004, + TC_OP_RESERVED_FOP_RTN_32_2 = 0x00000006, + TC_OP_ATOMIC_SWAP_RTN_32 = 0x00000007, + TC_OP_ATOMIC_CMPSWAP_RTN_32 = 0x00000008, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_32 = 0x00000009, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_32 = 0x0000000a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_32 = 0x0000000b, + TC_OP_PROBE_FILTER = 0x0000000c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_2 = 0x0000000e, + TC_OP_ATOMIC_ADD_RTN_32 = 0x0000000f, + TC_OP_ATOMIC_SUB_RTN_32 = 0x00000010, + TC_OP_ATOMIC_SMIN_RTN_32 = 0x00000011, + TC_OP_ATOMIC_UMIN_RTN_32 = 0x00000012, + TC_OP_ATOMIC_SMAX_RTN_32 = 0x00000013, + TC_OP_ATOMIC_UMAX_RTN_32 = 0x00000014, + TC_OP_ATOMIC_AND_RTN_32 = 0x00000015, + TC_OP_ATOMIC_OR_RTN_32 = 0x00000016, + TC_OP_ATOMIC_XOR_RTN_32 = 0x00000017, + TC_OP_ATOMIC_INC_RTN_32 = 0x00000018, + TC_OP_ATOMIC_DEC_RTN_32 = 0x00000019, + TC_OP_WBINVL1_VOL = 0x0000001a, + TC_OP_WBINVL1_SD = 0x0000001b, + TC_OP_RESERVED_NON_FLOAT_RTN_32_0 = 0x0000001c, + TC_OP_RESERVED_NON_FLOAT_RTN_32_1 = 0x0000001d, + TC_OP_RESERVED_NON_FLOAT_RTN_32_2 = 0x0000001e, + TC_OP_RESERVED_NON_FLOAT_RTN_32_3 = 0x0000001f, + TC_OP_WRITE = 0x00000020, + TC_OP_ATOMIC_FCMPSWAP_RTN_64 = 0x00000021, + TC_OP_ATOMIC_FMIN_RTN_64 = 0x00000022, + TC_OP_ATOMIC_FMAX_RTN_64 = 0x00000023, + TC_OP_RESERVED_FOP_RTN_64_0 = 0x00000024, + TC_OP_RESERVED_FOP_RTN_64_1 = 0x00000025, + TC_OP_RESERVED_FOP_RTN_64_2 = 0x00000026, + TC_OP_ATOMIC_SWAP_RTN_64 = 0x00000027, + TC_OP_ATOMIC_CMPSWAP_RTN_64 = 0x00000028, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_64 = 0x00000029, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_64 = 0x0000002a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_64 = 0x0000002b, + TC_OP_WBINVL2_SD = 0x0000002c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_0 = 0x0000002d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_1 = 0x0000002e, + TC_OP_ATOMIC_ADD_RTN_64 = 0x0000002f, + TC_OP_ATOMIC_SUB_RTN_64 = 0x00000030, + TC_OP_ATOMIC_SMIN_RTN_64 = 0x00000031, + TC_OP_ATOMIC_UMIN_RTN_64 = 0x00000032, + TC_OP_ATOMIC_SMAX_RTN_64 = 0x00000033, + TC_OP_ATOMIC_UMAX_RTN_64 = 0x00000034, + TC_OP_ATOMIC_AND_RTN_64 = 0x00000035, + TC_OP_ATOMIC_OR_RTN_64 = 0x00000036, + TC_OP_ATOMIC_XOR_RTN_64 = 0x00000037, + TC_OP_ATOMIC_INC_RTN_64 = 0x00000038, + TC_OP_ATOMIC_DEC_RTN_64 = 0x00000039, + TC_OP_WBL2_NC = 0x0000003a, + TC_OP_WBL2_WC = 0x0000003b, + TC_OP_RESERVED_NON_FLOAT_RTN_64_1 = 0x0000003c, + TC_OP_RESERVED_NON_FLOAT_RTN_64_2 = 0x0000003d, + TC_OP_RESERVED_NON_FLOAT_RTN_64_3 = 0x0000003e, + TC_OP_RESERVED_NON_FLOAT_RTN_64_4 = 0x0000003f, + TC_OP_WBINVL1 = 0x00000040, + TC_OP_ATOMIC_FCMPSWAP_32 = 0x00000041, + TC_OP_ATOMIC_FMIN_32 = 0x00000042, + TC_OP_ATOMIC_FMAX_32 = 0x00000043, + TC_OP_RESERVED_FOP_32_0 = 0x00000044, + TC_OP_RESERVED_FOP_32_2 = 0x00000046, + TC_OP_ATOMIC_SWAP_32 = 0x00000047, + TC_OP_ATOMIC_CMPSWAP_32 = 0x00000048, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_32 = 0x00000049, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_32 = 0x0000004a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_32 = 0x0000004b, + TC_OP_INV_METADATA = 0x0000004c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_2 = 0x0000004e, + TC_OP_ATOMIC_ADD_32 = 0x0000004f, + TC_OP_ATOMIC_SUB_32 = 0x00000050, + TC_OP_ATOMIC_SMIN_32 = 0x00000051, + TC_OP_ATOMIC_UMIN_32 = 0x00000052, + TC_OP_ATOMIC_SMAX_32 = 0x00000053, + TC_OP_ATOMIC_UMAX_32 = 0x00000054, + TC_OP_ATOMIC_AND_32 = 0x00000055, + TC_OP_ATOMIC_OR_32 = 0x00000056, + TC_OP_ATOMIC_XOR_32 = 0x00000057, + TC_OP_ATOMIC_INC_32 = 0x00000058, + TC_OP_ATOMIC_DEC_32 = 0x00000059, + TC_OP_INVL2_NC = 0x0000005a, + TC_OP_NOP_RTN0 = 0x0000005b, + TC_OP_RESERVED_NON_FLOAT_32_1 = 0x0000005c, + TC_OP_RESERVED_NON_FLOAT_32_2 = 0x0000005d, + TC_OP_RESERVED_NON_FLOAT_32_3 = 0x0000005e, + TC_OP_RESERVED_NON_FLOAT_32_4 = 0x0000005f, + TC_OP_WBINVL2 = 0x00000060, + TC_OP_ATOMIC_FCMPSWAP_64 = 0x00000061, + TC_OP_ATOMIC_FMIN_64 = 0x00000062, + TC_OP_ATOMIC_FMAX_64 = 0x00000063, + TC_OP_RESERVED_FOP_64_0 = 0x00000064, + TC_OP_RESERVED_FOP_64_1 = 0x00000065, + TC_OP_RESERVED_FOP_64_2 = 0x00000066, + TC_OP_ATOMIC_SWAP_64 = 0x00000067, + TC_OP_ATOMIC_CMPSWAP_64 = 0x00000068, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_64 = 0x00000069, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_64 = 0x0000006a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_64 = 0x0000006b, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_0 = 0x0000006c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_1 = 0x0000006d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_2 = 0x0000006e, + TC_OP_ATOMIC_ADD_64 = 0x0000006f, + TC_OP_ATOMIC_SUB_64 = 0x00000070, + TC_OP_ATOMIC_SMIN_64 = 0x00000071, + TC_OP_ATOMIC_UMIN_64 = 0x00000072, + TC_OP_ATOMIC_SMAX_64 = 0x00000073, + TC_OP_ATOMIC_UMAX_64 = 0x00000074, + TC_OP_ATOMIC_AND_64 = 0x00000075, + TC_OP_ATOMIC_OR_64 = 0x00000076, + TC_OP_ATOMIC_XOR_64 = 0x00000077, + TC_OP_ATOMIC_INC_64 = 0x00000078, + TC_OP_ATOMIC_DEC_64 = 0x00000079, + TC_OP_WBINVL2_NC = 0x0000007a, + TC_OP_NOP_ACK = 0x0000007b, + TC_OP_RESERVED_NON_FLOAT_64_1 = 0x0000007c, + TC_OP_RESERVED_NON_FLOAT_64_2 = 0x0000007d, + TC_OP_RESERVED_NON_FLOAT_64_3 = 0x0000007e, + TC_OP_RESERVED_NON_FLOAT_64_4 = 0x0000007f, + TC_OP_RESERVED_FOP_RTN_32_1__GFX09_10 = 0x00000005, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_1__GFX09_10 = 0x0000000d, + TC_OP_RESERVED_FOP_32_1__GFX09_10 = 0x00000045, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_1__GFX09_10 = 0x0000004d, + TC_OP_RESERVED_FADD_RTN_32__GFX11 = 0x00000005, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_RTN_32__GFX11 = 0x0000000d, + TC_OP_RESERVED_FADD_32__GFX11 = 0x00000045, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_32__GFX11 = 0x0000004d, +}; + +// Desc: Strucuture used to perform various atomic +// operations - add, subtract, increment, etc +struct AtomicTemplate { + PM4_MEC_ATOMIC_MEM atomic; +}; + +/// @brief PM4 command to write a 64-bit value into a memory +/// location accessible to Gpu +struct WriteDataTemplate { + PM4_MEC_WRITE_DATA write_data; +}; + +// ---------------------------------- MEC_COPY_DATA_src_sel_enum ---------------------------------- +enum MEC_COPY_DATA_src_sel_enum { + src_sel__mec_copy_data__mem_mapped_register = 0, + src_sel__mec_copy_data__tc_l2_obsolete = 1, + src_sel__mec_copy_data__tc_l2 = 2, + src_sel__mec_copy_data__gds = 3, + src_sel__mec_copy_data__perfcounters = 4, + src_sel__mec_copy_data__immediate_data = 5, + src_sel__mec_copy_data__atomic_return_data = 6, + src_sel__mec_copy_data__gds_atomic_return_data0 = 7, + src_sel__mec_copy_data__gds_atomic_return_data1 = 8, + src_sel__mec_copy_data__gpu_clock_count = 9, + src_sel__mec_copy_data__system_clock_count = 10, + src_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ---------------------------------- MEC_COPY_DATA_dst_sel_enum ---------------------------------- +enum MEC_COPY_DATA_dst_sel_enum { + dst_sel__mec_copy_data__mem_mapped_register = 0, + dst_sel__mec_copy_data__tc_l2 = 2, + dst_sel__mec_copy_data__gds = 3, + dst_sel__mec_copy_data__perfcounters = 4, + dst_sel__mec_copy_data__tc_l2_obsolete = 5, + dst_sel__mec_copy_data__mem_mapped_reg_dc = 6, + dst_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ------------------------------ MEC_COPY_DATA_src_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_src_cache_policy_enum { + src_cache_policy__mec_copy_data__lru = 0, + src_cache_policy__mec_copy_data__stream = 1, + src_cache_policy__mec_copy_data__noa = 2, + src_cache_policy__mec_copy_data__bypass = 3, +}; + +// --------------------------------- MEC_COPY_DATA_count_sel_enum --------------------------------- +enum MEC_COPY_DATA_count_sel_enum { + count_sel__mec_copy_data__32_bits_of_data = 0, + count_sel__mec_copy_data__64_bits_of_data = 1, +}; + +// --------------------------------- MEC_COPY_DATA_wr_confirm_enum --------------------------------- +enum MEC_COPY_DATA_wr_confirm_enum { + wr_confirm__mec_copy_data__do_not_wait_for_confirmation = 0, + wr_confirm__mec_copy_data__wait_for_confirmation = 1, +}; + +// ------------------------------ MEC_COPY_DATA_dst_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_dst_cache_policy_enum { + dst_cache_policy__mec_copy_data__lru = 0, + dst_cache_policy__mec_copy_data__stream = 1, + dst_cache_policy__mec_copy_data__noa = 2, + dst_cache_policy__mec_copy_data__bypass = 3, +}; + +// ------------------------------- MEC_COPY_DATA_pq_exe_status_enum ------------------------------- +enum MEC_COPY_DATA_pq_exe_status_enum { + pq_exe_status__mec_copy_data__default = 0, + pq_exe_status__mec_copy_data__phase_update = 1, +}; + +// ------------------------------- MEC_WRITE_DATA_dst_sel_enum ------------------------------- +enum MEC_WRITE_DATA_dst_sel_enum { + dst_sel__mec_write_data__mem_mapped_register = 0, + dst_sel__mec_write_data__tc_l2 = 2, + dst_sel__mec_write_data__gds = 3, + dst_sel__mec_write_data__memory = 5, + dst_sel__mec_write_data__memory_mapped_adc_persistent_state = 6 }; + +// ------------------------------- MEC_WRITE_DATA_addr_incr_enum ------------------------------- +enum MEC_WRITE_DATA_addr_incr_enum { + addr_incr__mec_write_data__increment_address = 0, + addr_incr__mec_write_data__do_not_increment_address = 1 }; + +// ------------------------------- MEC_WRITE_DATA_wr_confirm_enum ------------------------------- +enum MEC_WRITE_DATA_wr_confirm_enum { + wr_confirm__mec_write_data__do_not_wait_for_write_confirmation = 0, + wr_confirm__mec_write_data__wait_for_write_confirmation = 1 }; + +// ------------------------------- MEC_WRITE_DATA_cache_policy_enum ------------------------------- +enum MEC_WRITE_DATA_cache_policy_enum { + cache_policy__mec_write_data__lru = 0, + cache_policy__mec_write_data__stream = 1, + cache_policy__mec_write_data__noa = 2, + cache_policy__mec_write_data__bypass = 3 }; + +typedef struct PM4_MEC_COPY_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; /// header + uint32_t ordinal1; + }; + union { + struct { + uint32_t src_sel : 4; + uint32_t reserved1 : 4; + uint32_t dst_sel : 4; + uint32_t reserved2 : 1; + uint32_t src_cache_policy : 2; + uint32_t reserved3 : 1; + uint32_t count_sel : 1; + uint32_t reserved4 : 3; + uint32_t wr_confirm : 1; + uint32_t reserved5 : 4; + uint32_t dst_cache_policy : 2; + uint32_t reserved6 : 2; + uint32_t pq_exe_status : 1; + uint32_t reserved7 : 2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t src_reg_offset : 18; + uint32_t reserved8 : 14; + } bitfields3a; + struct { + uint32_t reserved9 : 2; + uint32_t src_32b_addr_lo : 30; + } bitfields3b; + struct { + uint32_t reserved10 : 3; + uint32_t src_64b_addr_lo : 29; + } bitfields3c; + struct { + uint32_t src_gds_addr_lo : 16; + uint32_t reserved11 : 16; + } bitfields3d; + uint32_t imm_data; + uint32_t ordinal3; + }; + union { + uint32_t src_memtc_addr_hi; + uint32_t src_imm_data; + uint32_t ordinal4; + }; + union { + struct { + uint32_t dst_reg_offset : 18; + uint32_t reserved12 : 14; + } bitfields5a; + struct { + uint32_t reserved13 : 2; + uint32_t dst_32b_addr_lo : 30; + } bitfields5b; + struct { + uint32_t reserved14 : 3; + uint32_t dst_64b_addr_lo : 29; + } bitfields5c; + struct { + uint32_t dst_gds_addr_lo : 16; + uint32_t reserved15 : 16; + } bitfields5d; + uint32_t ordinal5; + }; + uint32_t dst_addr_hi; +} PM4MEC_COPY_DATA; +namespace gfx9 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t coher_cntl:31; + uint32_t reserved1:1; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t tcl1_vol_action_ena:1; + uint32_t tc_vol_action_ena:1; + uint32_t reserved2:1; + uint32_t tc_wb_action_ena:1; + uint32_t tcl1_action_ena:1; + uint32_t tc_action_ena:1; + uint32_t reserved3:1; + uint32_t tc_nc_action_ena:1; + uint32_t tc_wc_action_ena:1; + uint32_t tc_md_action_ena:1; + uint32_t reserved4:3; + uint32_t cache_policy:2; + uint32_t reserved5:2; + uint32_t pq_exe_status:1; + uint32_t reserved6:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:6; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:24; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved3:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved4:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved5:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved6:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved7:16; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +/// @brief PM4 command to wait for a certain event before proceeding +/// to process another command on the queue +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx9 namespace + +namespace gfx10 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t reserved1; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; + union { + struct { + uint32_t gcr_cntl:19; + uint32_t reserved4:13; + } bitfields8; + uint32_t ordinal8; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:12; + uint32_t reserved2:1; + uint32_t cache_policy:2; + uint32_t reserved3:2; + uint32_t pq_exe_status:1; + uint32_t reserved4:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; ///header + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:14; + uint32_t mes_intr_pipe:2; + uint32_t mes_action:1; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved4:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved5:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved6:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved7:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved8:15; + uint32_t optimize_ace_offload_mode:1; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx10 namespace + +namespace gfx11 { + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:13; + uint32_t cache_policy:2; + uint32_t reserved2:1; + uint32_t pq_exe_status:1; + uint32_t reserved3:1; + uint32_t glk_inv:1; + uint32_t reserved4:1; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved5:16; + uint32_t dst_sel:2; + uint32_t reserved6:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved7:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved8:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved9:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved10; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved11; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved12; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved13; + uint32_t reserved14; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +} // gfx11 namespace + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/registers.h b/projects/rocr-runtime/libhsakmt/include/impl/registers.h new file mode 100644 index 0000000000..4d430b41e4 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/registers.h @@ -0,0 +1,363 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// This file is used only for open source cmake builds, if we hardcode the +// register values in amd_aql_queue.cpp then this file won't be required. For +// now we are using this file where register details are spelled out in the +// structs/unions below. +#ifndef _WSL_INC_REGISTERS_H_ +#define _WSL_INC_REGISTERS_H_ + +typedef enum SQ_RSRC_BUF_TYPE { +SQ_RSRC_BUF = 0x00000000, +SQ_RSRC_BUF_RSVD_1 = 0x00000001, +SQ_RSRC_BUF_RSVD_2 = 0x00000002, +SQ_RSRC_BUF_RSVD_3 = 0x00000003, +} SQ_RSRC_BUF_TYPE; + +typedef enum BUF_DATA_FORMAT { +BUF_DATA_FORMAT_INVALID = 0x00000000, +BUF_DATA_FORMAT_8 = 0x00000001, +BUF_DATA_FORMAT_16 = 0x00000002, +BUF_DATA_FORMAT_8_8 = 0x00000003, +BUF_DATA_FORMAT_32 = 0x00000004, +BUF_DATA_FORMAT_16_16 = 0x00000005, +BUF_DATA_FORMAT_10_11_11 = 0x00000006, +BUF_DATA_FORMAT_11_11_10 = 0x00000007, +BUF_DATA_FORMAT_10_10_10_2 = 0x00000008, +BUF_DATA_FORMAT_2_10_10_10 = 0x00000009, +BUF_DATA_FORMAT_8_8_8_8 = 0x0000000a, +BUF_DATA_FORMAT_32_32 = 0x0000000b, +BUF_DATA_FORMAT_16_16_16_16 = 0x0000000c, +BUF_DATA_FORMAT_32_32_32 = 0x0000000d, +BUF_DATA_FORMAT_32_32_32_32 = 0x0000000e, +BUF_DATA_FORMAT_RESERVED_15 = 0x0000000f, +} BUF_DATA_FORMAT; + +typedef enum BUF_NUM_FORMAT { +BUF_NUM_FORMAT_UNORM = 0x00000000, +BUF_NUM_FORMAT_SNORM = 0x00000001, +BUF_NUM_FORMAT_USCALED = 0x00000002, +BUF_NUM_FORMAT_SSCALED = 0x00000003, +BUF_NUM_FORMAT_UINT = 0x00000004, +BUF_NUM_FORMAT_SINT = 0x00000005, +BUF_NUM_FORMAT_SNORM_OGL__SI__CI = 0x00000006, +BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006, +BUF_NUM_FORMAT_FLOAT = 0x00000007, +} BUF_NUM_FORMAT; + +typedef enum BUF_FORMAT { +BUF_FORMAT_32_UINT = 0x00000014, +} BUF_FORMAT; + +typedef enum SQ_SEL_XYZW01 { +SQ_SEL_0 = 0x00000000, +SQ_SEL_1 = 0x00000001, +SQ_SEL_RESERVED_0 = 0x00000002, +SQ_SEL_RESERVED_1 = 0x00000003, +SQ_SEL_X = 0x00000004, +SQ_SEL_Y = 0x00000005, +SQ_SEL_Z = 0x00000006, +SQ_SEL_W = 0x00000007, +} SQ_SEL_XYZW01; + + union COMPUTE_TMPRING_SIZE { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 13; + unsigned int : 7; +#elif defined(BIGENDIAN_CPU) + unsigned int : 7; + unsigned int WAVESIZE : 13; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union COMPUTE_TMPRING_SIZE_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 15; + unsigned int : 5; +#elif defined(BIGENDIAN_CPU) + unsigned int : 5; + unsigned int WAVESIZE : 15; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union COMPUTE_TMPRING_SIZE_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 18; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int WAVESIZE : 18; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD0 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD1 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int CACHE_SWIZZLE : 1; + unsigned int SWIZZLE_ENABLE : 1; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 1; + unsigned int CACHE_SWIZZLE : 1; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD1_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int SWIZZLE_ENABLE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 2; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD2 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_RECORDS : 32; +#elif defined(BIGENDIAN_CPU) + unsigned int NUM_RECORDS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD3 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int NUM_FORMAT : 3; + unsigned int DATA_FORMAT : 4; + unsigned int ELEMENT_SIZE : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int HASH_ENABLE : 1; + unsigned int HEAP : 1; + unsigned int MTYPE__CI__VI : 3; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int MTYPE__CI__VI : 3; + unsigned int HEAP : 1; + unsigned int HASH_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int ELEMENT_SIZE : 2; + unsigned int DATA_FORMAT : 4; + unsigned int NUM_FORMAT : 3; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD3_GFX10 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 7; + unsigned int RESERVED1 : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESOURCE_LEVEL : 1; + unsigned int RESERVED2 : 3; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 3; + unsigned int RESOURCE_LEVEL : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 2; + unsigned int FORMAT : 7; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESERVED2 : 4; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 4; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int COMPRESSION_EN : 1; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int COMPRESSION_EN : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; +#endif // header guard diff --git a/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h new file mode 100644 index 0000000000..d6bdce2451 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h @@ -0,0 +1,122 @@ +#ifndef _WSL_INC_THUNK_PROXY_H_ +#define _WSL_INC_THUNK_PROXY_H_ + +#include + +namespace thunk_proxy { +enum AllocDomain { + kSystem, + kLocal, + kUserMemory, + kUserQueue, + kDomainCount, +}; + +enum MemFlag { + kFineGrain = (1ULL << 0), + kKernarg = (1ULL << 1), +}; + +enum EngineFlag { + KCOMPUTE0 = (1ULL << 0), + KDRMDMA = (1ULL << 1), + KDRMDMA1 = (1ULL << 2), +}; + +enum SchedLevel { + kLow = 0, + kNormal = 1, + kHigh = 2, +}; + +struct HwsInfo { + union { + struct { + uint32_t gfxHwsEnabled : 1; + uint32_t computeHwsEnabled : 1; + uint32_t dmaHwsEnabled : 1; + uint32_t dma1HwsEnabled : 1; + uint32_t reserved : 28; + } hwsMask; + uint32_t osHwsEnableFlags; + }; + uint64_t engineOrdinalMask; // Indicates which engines (by ordinal) support MES HWS +}; + +typedef struct { + int major; + int minor; + int stepping; + bool is_dgpu; + char product_name[MAX_PATH]; + uint64_t uuid; + uint32_t family; + uint32_t device_id; + uint32_t wavefront_size; + uint32_t compute_unit_count; + uint32_t max_engine_clock_mhz; + uint32_t watch_points_num; + uint32_t pci_bus_addr; + uint32_t memory_bus_width; + uint32_t max_memory_clock_mhz; + uint64_t gpu_counter_frequency; + uint32_t wave_per_cu; + uint32_t simd_per_cu; + uint32_t max_scratch_slots_per_cu; + uint32_t num_shader_engine; + uint32_t shader_array_per_shader_engine; + uint32_t domain; + uint32_t num_gws; + uint32_t asic_revision; + uint64_t local_visible_heap_size; + uint64_t local_invisible_heap_size; + uint64_t non_local_heap_size; + uint64_t private_aperture_base; + uint64_t private_aperture_size; + uint64_t shared_aperture_base; + uint64_t shared_aperture_size; + uint32_t user_queue_size; + uint32_t lds_size; + uint32_t big_page_alignment_size; + uint32_t hw_big_page_min_alignment_size; + uint32_t hw_big_page_alignment_size; + bool enable_big_page_alignment; + uint32_t mec_fw_version; + uint32_t sdma_fw_version; + uint32_t l1_cache_size; + uint32_t l2_cache_size; + uint32_t l3_cache_size; + uint32_t gl2_cacheline_size; + uint32_t num_cp_queues; + HwsInfo hwsInfo; + std::vector sdma_schedid; + uint32_t compute_schedid; + bool state_shadowing_by_cpfw; + bool platform_atomic_support; + void *adapter_info; + uint32_t kmd_version; +} DeviceInfo; + +int EngineOrdinal(int engine, DeviceInfo *device_info); +bool GetHwsEnabled(int engine, DeviceInfo *device_info); +bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info); +bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info); +bool QueryAdapterSupported(unsigned int device_id); + +uint32_t QueueEngine2EngineFlag(uint32_t queue_engine); +void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain, + uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info); +void GetAllocPrivDataSize(int *priv_drv_data_size, int *priv_alloc_data_size); +void FillinAllocPrivDrvData(void *drv_priv, int priv_alloc_data_size); + +int GetSubmitPrivDataSize(); +void FillinSubmitPrivData(void *priv_data, D3DKMT_HANDLE queue, uint64_t command_addr, + uint64_t command_size, bool is_hw_queue); +int GetHwQueuePrivDataSize(); +void FillinHwQueuePrivData(void *priv_data, bool FwManagedGfxState, SchedLevel level = kNormal); +int GetContextPrivDataSize(); +void FillinContextPrivData(void *priv_data, bool FwManagedGfxState); +int GetPowerOptPrivDataSize(); +void FillinPowerOptPrivData(void *priv_data, bool restore); +} +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h new file mode 100644 index 0000000000..3fd3f69553 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h @@ -0,0 +1,169 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_ +#define _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_ + +#include + +#include + +typedef uint32_t UINT, *UINT_PTR; +typedef int32_t INT32; +typedef int32_t LONG; +typedef uint32_t ULONG, *ULONG_PTR; +typedef int64_t LONGLONG; +typedef int64_t LONG64; +typedef uint64_t ULONGLONG; +typedef uint64_t ULONG64, *ULONG64_PTR; +typedef uint8_t BYTE; +typedef uint16_t WORD; +typedef uint32_t DWORD; +typedef int32_t BOOL; +typedef int32_t NTSTATUS; +typedef uint16_t USHORT; +typedef uint16_t UINT16; +typedef uint32_t UINT32; +typedef uint64_t UINT64; +typedef int32_t INT; +typedef uint64_t SIZE_T; +typedef void VOID; +typedef float FLOAT; +typedef char CHAR; +typedef unsigned char UCHAR; +typedef UCHAR BOOLEAN; +typedef int16_t WCHAR; +typedef void *HANDLE; +typedef void *PVOID; +typedef void *LPVOID; +typedef const int16_t *PCWSTR; + +#define ULONG ULONG +#define ULONG_PTR ULONG_PTR +#define USHORT USHORT + +#define DECLARE_HANDLE(name) struct name##__{int unused;}; typedef struct name##__ *name +#define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1] + +DECLARE_HANDLE(HWND); +DECLARE_HANDLE(HDC); +DECLARE_HANDLE(PALETTEENTRY); + +typedef struct tagPOINT { + LONG x; + LONG y; +} POINT; + +typedef struct tagRECT { + LONG left; + LONG top; + LONG right; + LONG bottom; +} RECT; + +typedef struct tagRECTL { + LONG left; + LONG top; + LONG right; + LONG bottom; +} RECTL; + +typedef union _LARGE_INTEGER { + struct { + DWORD LowPart; + DWORD HighPart; + } u; + LONGLONG QuadPart; +} LARGE_INTEGER; + +typedef LARGE_INTEGER *PLARGE_INTEGER; + +typedef union _ULARGE_INTEGER { + struct { + ULONG LowPart; + ULONG HighPart; + } DUMMYSTRUCTNAME; + struct { + ULONG LowPart; + ULONG HighPart; + } u; + ULONGLONG QuadPart; +} ULARGE_INTEGER; + +typedef ULARGE_INTEGER *PULARGE_INTEGER; + +typedef struct _LUID { + ULONG LowPart; + LONG HighPart; +} LUID, *PLUID; + +typedef enum _DEVICE_POWER_STATE { + PowerDeviceUnspecified = 0, + PowerDeviceD0, + PowerDeviceD1, + PowerDeviceD2, + PowerDeviceD3, + PowerDeviceMaximum +} DEVICE_POWER_STATE, *PDEVICE_POWER_STATE; + +#define _Check_return_ +#define APIENTRY +#define CONST const +#define IN +#define OUT +#define FAR +#define MAX_PATH 260 +#define __stdcall + +#ifndef GUID_DEFINED +#define GUID_DEFINED +typedef struct _GUID { + uint32_t Data1; + uint16_t Data2; + uint16_t Data3; + uint8_t Data4[ 8 ]; +} GUID; +#endif + +#include + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h new file mode 100644 index 0000000000..f1e7d22d91 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ + +#ifndef _WSL_INC_WDDM_CMD_UTIL_H_ +#define _WSL_INC_WDDM_CMD_UTIL_H_ + +#include +#include "impl/hsa/hsa.h" +#include "impl/hsa/amd_hsa_queue.h" +#include "impl/hsa/amd_hsa_kernel_code.h" +#include "impl/pm4_cmds.h" +#include "util/utils.h" + +namespace wsl { +namespace thunk { + +struct DispatchInfo { + uint8_t major; + hsa_kernel_dispatch_packet_t *pPacket; + void *pEntry; + const amd_kernel_code_t *pKernelObject; + uint32_t ldsBlks; + amd_queue_v2_t *pAmdQueue; + bool wave32; + uint32_t srd; + void *pScratchBase; + uint32_t scratchSizePerWave; + uint32_t scratchBaseOffset[2]; + uint32_t offsetCnt; +}; + +class CmdUtil { +public: + CmdUtil() {}; + ~CmdUtil() {}; + + static size_t BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel = dst_sel__mec_copy_data__tc_l2, + uint32_t dstCachePolicy = dst_cache_policy__mec_copy_data__stream, + uint32_t srcSel = src_sel__mec_copy_data__gpu_clock_count, + uint32_t srcCachePolicy = src_cache_policy__mec_copy_data__lru, + uint32_t countSel = count_sel__mec_copy_data__64_bits_of_data, + uint32_t wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation); + + static size_t BuildBarrier( + void *pBuffer, + uint32_t eventIndex = event_index__mec_event_write__cs_partial_flush, + uint32_t eventType = CS_PARTIAL_FLUSH); + + static size_t BuildWriteData64Command( + void *pBuffer, + uint64_t* write_addr, + uint64_t write_value); + + static size_t BuildAcquireMem( + uint8_t major, + void *pBuffer); + + static size_t BuildScratch( + void *pScratchBase, + void *pBuffer); + + static size_t BuildComputeShaderParams( + void *pBuffer); + + static size_t BuildDispatch( + struct DispatchInfo *pInfo, + void *pBuffer); + + static size_t BuildAtomicMem( + uint64_t *pAddr, + uint32_t atomic, + void *pBuffer, + uint32_t cachePolicy = cache_policy__mec_atomic_mem__stream, + uint64_t srcData = 1); +}; + +} // namespace thunk +} // namespace wsl + +#endif \ No newline at end of file diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h new file mode 100644 index 0000000000..15821b5483 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h @@ -0,0 +1,246 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_DEVICE_H_ +#define _WSL_INC_WDDM_DEVICE_H_ + +#include +#include + +#include +#include +#include + +#include "impl/wddm/types.h" +#include "impl/thunk_proxy/thunk_proxy.h" +#include "impl/wddm/va_mgr.h" +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" +#include "impl/wddm/gpu_memory.h" +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +//class Queue; +class WDDMQueue; + +// WSL2 hyperv GPADL protocol limitation +#define MAX_USERPTR_BLOCK_SIZE 0xf0000000 +#define START_NON_CANONICAL_ADDR (1ULL << 47) +#define END_NON_CANONICAL_ADDR (~0UL - (1UL << 47)) +#define IS_OVERLAPPING(start1, size1, start2, size2) \ + ((start1 < (start2 + size2)) && (start2 < (start1 + size1))) + +struct SegmentInfo { + uint32_t segment_id; + uint32_t segment_type; // 0=aperture, 1=gpu memory, 2=system memory + bool aperture; + bool system_memory; + uint64_t commit_limit; + + SegmentInfo() + : segment_id(0), segment_type(0), aperture(false), + system_memory(false), commit_limit(0) {} +}; + +class WDDMDevice { +public: + static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30); // 2 GB + + WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id); + ~WDDMDevice(); + + int NodeId() const { return node_id_; } + int Major() { return device_info_.major; } + int Minor() { return device_info_.minor; } + int Stepping() { return device_info_.stepping; } + bool IsDgpu() { return device_info_.is_dgpu; } + const char *ProductName() { return device_info_.product_name; } + uint64_t Uuid() { return device_info_.uuid; } + uint32_t GfxFamily() { return device_info_.family; } + uint32_t DeviceId() { return device_info_.device_id; } + uint32_t WavefrontSize() { return device_info_.wavefront_size; } + uint32_t ComputeUnitCount() { return device_info_.compute_unit_count; } + uint32_t MaxEngineClockMhz() { return device_info_.max_engine_clock_mhz; } + uint32_t WatchPointsNum() { return device_info_.watch_points_num; } + uint32_t PciBusAddr() { return device_info_.pci_bus_addr; } + + uint32_t MemoryBusWidth() { return device_info_.memory_bus_width; } + uint32_t MaxMemoryClockMhz() { return device_info_.max_memory_clock_mhz; } + uint32_t WavePerCu() { return device_info_.wave_per_cu; } + uint32_t SimdPerCu() { return device_info_.simd_per_cu; } + uint32_t MaxScratchSlotsPerCu() { return device_info_.max_scratch_slots_per_cu; } + uint32_t NumShaderEngine() { return device_info_.num_shader_engine; } + uint32_t ShaderArrayPerShaderEngine() { return device_info_.shader_array_per_shader_engine; } + uint32_t NumSdmaEngine() { return device_info_.sdma_schedid.size(); } + uint32_t Domain() { return device_info_.domain; } + uint32_t NumGws() { return device_info_.num_gws; } + uint32_t AsicRevision() { return device_info_.asic_revision; } + uint64_t LocalHeapSize() { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; } + uint64_t LocalVisibleHeapSize() { return device_info_.local_visible_heap_size; } + uint64_t LocalInvisibleHeapSize() { return device_info_.local_invisible_heap_size; } + uint64_t NonLocalHeapSize() { return device_info_.non_local_heap_size; } + uint64_t PrivateApertureBase() { return device_info_.private_aperture_base; } + uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } + uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } + uint64_t SharedApertureSize() { return device_info_.shared_aperture_size; } + uint32_t LdsSize() { return device_info_.lds_size; } + uint64_t GPUCounterFrequency() { return device_info_.gpu_counter_frequency; } + uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; } + uint32_t GetMecFwVersion() { return device_info_.mec_fw_version; } + uint32_t GetSdmaFwVersion() { return device_info_.sdma_fw_version; } + uint32_t GetL1CacheSize() { return device_info_.l1_cache_size; } + uint32_t GetL2CacheSize() { return device_info_.l2_cache_size; } + uint32_t GetL3CacheSize() { return device_info_.l3_cache_size; } + uint32_t Gl2CacheLineSize() { return device_info_.gl2_cacheline_size; } + bool SupportStateShadowingByCpFw(void) const { return device_info_.state_shadowing_by_cpfw; } + bool SupportPlatformAtomic(void) const { return device_info_.platform_atomic_support; } + uint32_t GetSdmaEngine(uint32_t idx) { + assert(idx < NumSdmaEngine()); + return device_info_.sdma_schedid[idx]; + } + uint32_t GetComputeEngine() { return device_info_.compute_schedid; } + + uint64_t VramAvail(); + + void GetClockCounters(uint64_t *gpu, uint64_t *cpu); + uint32_t GetNumCpQueues() { return device_info_.num_cp_queues; } + + bool CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr); + void DestroySyncobj(D3DKMT_HANDLE handle); + + bool CreateQueue(WDDMQueue *queue); + void DestroyQueue(WDDMQueue *queue); + bool CreateHwQueue(WDDMQueue *queue); + bool DestroyHwQueue(WDDMQueue *queue); + bool SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + bool SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + + bool WaitPagingFence(WDDMQueue *queue) { + uint64_t value = page_fence_value_; + + if (*page_fence_addr_ < value && + !GpuWait(queue, &page_syncobj_, &value, 1)) + return false; + + return true; + } + + bool GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs, + uint64_t *values, int count); + bool GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs, + uint64_t *value, int count); + bool CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value, + int count, bool wait_any); + bool WaitOnPagingFenceFromCpu(); + + uint32_t LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt); + uint32_t GetCmdbufSize(void) const { return cmdbuf_size_; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size_; } + static uint32_t GetAqlFrameNum(void) { return cmdbuf_aql_frame_num_; } + + // Both legacy HWS and stage 1 HWS use KMD to alloc use queue memory, + // return false by default + bool AllocUserQueueMemFromUMD(void) const { return false; } + + bool IsHwsEnabled(int engine) { + return thunk_proxy::GetHwsEnabled(engine, &device_info_); + } + + void UpdatePageFence(uint64_t fence_value); + + D3DKMT_HANDLE PagingQueue() const { return page_queue_; } + D3DKMT_HANDLE PagingFence() const { return page_syncobj_; } + D3DKMT_HANDLE DeviceHandle() const { return device_; } + LUID GetLuid() const { return adapter_luid_; } + D3DKMT_HANDLE GetAdapter() const { return adapter_; } + + const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } + + ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr); + +private: + bool ParseDeviceInfo(void); + void DestroyDeviceInfo(void); + bool CreateDevice(void); + bool DestroyDevice(void); + bool CreatePagingQueue(void); + bool DestroyPagingQueue(void); + void *Lock(D3DKMT_HANDLE handle); + bool Unlock(D3DKMT_HANDLE handle); + bool CreateContext(int engine, D3DKMT_HANDLE *handle); + bool DestroyContext(D3DKMT_HANDLE handle); + + void SetPowerOptimization(bool restore); + void InitCmdbufInfo(void); + + bool QuerySegmentInfo(); + bool GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type, uint32_t &segment_id); + + D3DKMT_HANDLE adapter_; + LUID adapter_luid_; + D3DKMT_HANDLE device_; + + D3DKMT_HANDLE page_queue_; + D3DKMT_HANDLE page_syncobj_; + uint64_t *page_fence_addr_; + std::atomic page_fence_value_; + + uint32_t cmdbuf_size_; + uint32_t cmdbuf_aql_frame_size_; + static const uint32_t cmdbuf_aql_frame_num_; + uint32_t node_id_; + // device info + thunk_proxy::DeviceInfo device_info_; + std::vector segment_infos_; + //CmdUtil cmd_util; +}; + +NTSTATUS WDDMCreateDevices(std::vector &devices); + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h new file mode 100644 index 0000000000..9703a6d2c7 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h @@ -0,0 +1,249 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_GPU_MEMORY_H_ +#define _WSL_INC_WDDM_GPU_MEMORY_H_ + +#include +#include +#include "util/utils.h" +#include "impl/wddm/types.h" +#include "impl/wddm/thunks.h" +#include "impl/thunk_proxy/thunk_proxy.h" + +namespace wsl { +namespace thunk { + +class WDDMDevice; + +union GpuMemoryCreateFlags { + struct { + uint64_t virtual_alloc : 1; // only allocate virtual address, without physical buffer + uint64_t physical_only : 1; // only allocate physical buffer, without virutal address + uint64_t interprocess : 1; // physical buffer need share info between exporter and importer + uint64_t locked : 1; // lock virtual address space into RAM, preventing that memory from being paged to the swap area + uint64_t physical_contiguous : 1; // contiguous physical pages + uint64_t sysmem_ipc_sig_importer : 1; // allocate system memory for IPC signal + uint64_t sysmem_ipc_sig_exporter : 1; // allocate system memory for IPC signal, prepare to export + uint64_t alloc_va : 1; // allocate va. 0 for vmem import + uint64_t blit_kernel_object : 1; // allocate executable blit kernel object + uint64_t unused : 55; + }; + uint64_t reserved; +}; + +union GpuMemoryDescFlags { + struct { + uint32_t is_virtual : 1; + uint32_t is_shared : 1; + uint32_t is_external : 1; + uint32_t is_physical_only : 1; + uint32_t is_locked : 1; + uint32_t is_queue_referenced : 1; + uint32_t is_physical_contiguous : 1; + uint32_t is_imported_sys_memfd : 1; // 0 - ignored; 1 - va from system heap + uint32_t is_sysmem_exporter : 1; // allocate system memory for IPC signal, prepare to export + uint32_t is_va_required :1; + uint32_t is_imported_vram_vmem :1; + uint32_t is_imported_vram_ipc :1; + uint32_t is_imported_from_same_process : 3; // imported from same process, record shared cnt + uint32_t is_blit_kernel_object : 1; // blit kernel object + uint32_t unused : 16; + }; + + uint32_t reserved; +}; + +struct GpuMemoryCreateInfo { + GpuMemoryCreateInfo() { + flags.reserved = 0; + domain = thunk_proxy::kLocal; + size = 0; + alignment = 0; + mem_flags = 0; + engine_flag = 0; + va_hint = 0; + user_ptr = nullptr; + dmabuf_fd = -1; + } + + GpuMemoryCreateFlags flags; + thunk_proxy::AllocDomain domain; + gpusize size; + gpusize alignment; + int mem_flags; + int engine_flag; + int dmabuf_fd; // Import from dmabuf + + void *user_ptr; + gpusize va_hint; +}; + +struct GpuMemoryDesc { + GpuMemoryDesc() { + gpu_addr = 0; + cpu_addr = nullptr; + client_size = 0; + size = alignment = 0; + flags.reserved = 0; + mem_flags = 0; + engine_flag = 0; + handle_ape_addr = 0; + } + + thunk_proxy::AllocDomain domain; + LUID adapter_luid; // Where is the backing store location + gpusize gpu_addr; + void *cpu_addr; + gpusize client_size; // user request size + gpusize size; + gpusize alignment; + gpusize handle_ape_addr; + + GpuMemoryDescFlags flags; + int mem_flags; + int engine_flag; +}; + +struct SharedHandleInfo { + thunk_proxy::AllocDomain domain; + LUID adapter_luid; + gpusize client_size; // user request size + uint64_t size; + uint32_t flags; + int mem_flags; + pid_t pid; + gpusize gpu_addr; +}; + +using GpuMemoryHandle = void *; + +class GpuMemory { +public: + static size_t CalcChunkNumbers(gpusize size); + + ErrorCode Init(const GpuMemoryCreateInfo &create_info); + + WDDMDevice *GetDevice() const { return device_; } + gpusize Size() const { return desc_.size; } + gpusize ClientSize() const { return desc_.client_size; } + uint64_t GpuAddress() const { return desc_.gpu_addr; } + void *CpuAddress() const { return desc_.cpu_addr; } + uint64_t HandleApeAddress() const { return desc_.handle_ape_addr; } + + inline bool IsLocal() const { return desc_.domain == thunk_proxy::kLocal; } + inline bool IsUserMemory() const { return desc_.domain == thunk_proxy::kUserMemory; } + inline bool IsSystem() const { return desc_.domain == thunk_proxy::kSystem; } + inline bool IsSysMemFd() const { return desc_.flags.is_imported_sys_memfd; } + inline bool IsUserQueue() const { return desc_.domain == thunk_proxy::kUserQueue; } + inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; } + inline bool IsPhysicalContiguous() const { return desc_.flags.is_physical_contiguous; } + inline bool IsVirtual() const { return desc_.flags.is_virtual; } + inline bool IsShared() const { return desc_.flags.is_shared; } + inline bool IsExternal() const { return desc_.flags.is_external; } + inline bool IsVaAllocated() const { return desc_.flags.is_va_required; } + inline bool IsBlitKernelObject() const { return desc_.flags.is_blit_kernel_object; } + + inline uint32_t Flags() const { return desc_.flags.reserved; } + inline int GetAllocInfo() const { return desc_.mem_flags; } + inline bool IsFineGrain() const { return (desc_.mem_flags & thunk_proxy::kFineGrain); } + inline bool IsSameAdapter(const LUID &luid) const { + return (desc_.adapter_luid.HighPart == luid.HighPart && + desc_.adapter_luid.LowPart == luid.LowPart); + } + inline void GetQueueReference() { desc_.flags.is_queue_referenced = 1; } + inline void PutQueueReference() { desc_.flags.is_queue_referenced = 0; } + inline bool IsQueueReferenced() const { return desc_.flags.is_queue_referenced; } + inline void IncSharedReference() { desc_.flags.is_imported_from_same_process++; } + inline uint32_t DecSharedReference() { return (desc_.flags.is_imported_from_same_process == 0) ? 0 : --desc_.flags.is_imported_from_same_process; } + inline bool IsSharedFromSameProcess() const { return desc_.flags.is_imported_from_same_process > 0; } + + WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; } + size_t NumChunks() const { return num_allocations_; } + + const GpuMemoryHandle GetGpuMemoryHandle() const { + return reinterpret_cast(const_cast(this)); + } + + static GpuMemory *Convert(GpuMemoryHandle handle) { return reinterpret_cast(handle); } + + ErrorCode ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize va_size, gpusize alignment); + ErrorCode FreeGpuVirtualAddress(gpusize va_start_address, gpusize va_size); + + ErrorCode MapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + ErrorCode UnmapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + + ErrorCode MakeResident(); + ErrorCode Evict(); + + ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS); + ErrorCode ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr = nullptr); + ~GpuMemory(); +protected: + explicit GpuMemory(WDDMDevice *device); +private: + ErrorCode CreatePhysicalMemory(); + ErrorCode FreePhysicalMemory(); + + uint64_t AdjustSize(gpusize size) const; +private: + friend class WDDMDevice; + + WDDMDevice *const device_; + + GpuMemoryDesc desc_; + + size_t num_allocations_; + WinAllocationHandle *alloc_handles_ptr_; + WinAllocationHandle alloc_handle_; // Optimization for num_allocations_ is 1 + + WinResourceHandle resource_; // Handle to a resource object that wraps the allocation. Used for shared resources + + int mem_fd_; // IPC sigal's sys mem fd + + DISALLOW_COPY_AND_ASSIGN(GpuMemory); +}; + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h new file mode 100644 index 0000000000..0e936c5721 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h @@ -0,0 +1,370 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// +#ifndef _WSL_INC_WDDM_QUEUE_H_ +#define _WSL_INC_WDDM_QUEUE_H_ + +#include +#include +#include +#include +#include +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "impl/wddm/gpu_memory.h" +#include "impl/hsa/hsa_ext_amd.h" +#include "impl/hsa/amd_hsa_queue.h" +#include "impl/hsa/amd_hsa_signal.h" +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +class Queue; +class WDDMDevice; + +class WDDMQueue { +public: + WDDMQueue(WDDMDevice *device, + uint64_t cmdbuf_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true) : + device(device), + context(0), + queue(0), + syncobj(0), + sync_addr(NULL), + cmdbuf(0), + cmdbuf_addr(cmdbuf_addr), + cmdbuf_size(cmdbuf_size), + queue_engine(engine), + use_hws(use_hws), + prio(thunk_proxy::kNormal) { + + } + + virtual ~WDDMQueue() { } + + virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; } + virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; } + virtual void RingDoorbell() { } + virtual void* GetHsaQueueAddr(void) const { return reinterpret_cast(GetCmdbufAddr()); } + + hsa_status_t SwsInit(void); + hsa_status_t SwsFini(void); + hsa_status_t SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + + hsa_status_t HwsInit(void); + hsa_status_t HwsFini(void); + hsa_status_t HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + hsa_status_t SetPriority(hsa_amd_queue_priority_t priority); + + uint64_t *GetSyncAddr(void) const { return sync_addr; } + uint64_t GetCmdbufAddr(void) const { return cmdbuf_addr; } + + thunk_proxy::SchedLevel ConvertSchedLevel(hsa_amd_queue_priority_t prio) const { + switch (prio) { + case HSA_AMD_QUEUE_PRIORITY_LOW: + return thunk_proxy::kLow; + case HSA_AMD_QUEUE_PRIORITY_HIGH: + return thunk_proxy::kHigh; + case HSA_AMD_QUEUE_PRIORITY_NORMAL: + default: + return thunk_proxy::kNormal; + } + } + + WDDMDevice *device; + + D3DKMT_HANDLE context; + D3DKMT_HANDLE queue; + + D3DKMT_HANDLE syncobj; + uint64_t *sync_addr; + + GpuMemoryHandle cmdbuf; + uint64_t cmdbuf_addr; + uint32_t cmdbuf_size; + + GpuMemoryHandle queue_mem; + uint64_t queue_addr; + + uint32_t queue_engine; + + bool use_hws; + thunk_proxy::SchedLevel prio; +}; + +class ComputeQueue : public WDDMQueue { +public: + ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + ~ComputeQueue(); + + virtual hsa_status_t Init(void); + virtual hsa_status_t Fini(void); + virtual hsa_status_t Submit(void); + + void* GetRing(void) const { return ring; } + uint64_t GetRingSize(void) const { return ring_size; } + std::atomic* GetRingWptr(void) const { return ring_wptr; } + std::atomic* GetRingRptr(void) const { return ring_rptr; } + + uint64_t GetAqlWriteIndex(void) const { return cmdbuf_aql_frame_write_index; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size; } + void* GetHsaQueueAddr(void) const { return ring; } + + bool IsInvalidPacket(void) const { + uint16_t *packet = (uint16_t *)((char *)ring + + (cmdbuf_aql_frame_write_index % ring_size) * 64); + return ((*packet >> HSA_PACKET_HEADER_TYPE) & ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) + == HSA_PACKET_TYPE_INVALID; + } + + hsa_status_t Process(void); + uint64_t * GetDoorbellPtr() const { return (uint64_t *)&doorbell_signal_value_; } + void RingDoorbell(); +private: + hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet); + hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false); + + uint64_t CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet); + uint64_t CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, bool wave32); + + struct amd_aql_pm4_ib { + uint16_t header; + uint16_t ven_hdr; + uint32_t ib_jump_cmd[4]; + uint32_t dw_cnt_remain; + uint32_t reserved[8]; + hsa_signal_t completion_signal; + }; + hsa_status_t VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet); + hsa_status_t SwitchAql2PM4(void); + + hsa_status_t PreSubmit(void); + hsa_status_t EndSubmit(void); + + void *ring; + uint64_t ring_size; + std::atomic *ring_wptr; + std::atomic *ring_rptr; + + // ib_start_addr is the current ib start address + uint64_t ib_start_addr; + + // ib_size is the current ib size. + uint64_t ib_size; + + // record the last submitted aql frame write index + uint64_t sync_point; + + uint64_t cmdbuf_aql_frame_write_index; + uint32_t cmdbuf_aql_frame_size; + + uint64_t *signal_addr_; + bool platform_atomic_support_; + bool needs_barrier; + bool ready_to_submit; + + CmdUtil cmd_util; + +private: + bool EnableProfiling() { + return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING); + } + void HandleError(hsa_status_t status); + bool UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32); + + uint32_t UpdateIndexStride(uint32_t srd, bool wave32); + + void *ScratchBase() { return scratch_base_; } + + void AppendCmdbufSratchBaseOffset(int offset) { + scratch_base_offset_array_.push_back(offset); + } + + bool RelocateCmdbufScratchBase(uint64_t addr); + + uint32_t ScratchSizePerWave() { return scratch_size_per_wave_; } + uint64_t GetKernelObjAddr(uint64_t addr) const; + void InitScratchSRD(); + GpuMemoryHandle amd_queue_mem_; + amd_queue_v2_t *amd_queue_; + amd_queue_v2_t *amd_queue_rocr_; + uint64_t doorbell_signal_value_; + volatile std::atomic *error_code_; + std::thread aql_to_pm4_thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void AqlToPm4Thread(ComputeQueue *queue); + + uint64_t max_scratch_waves_; + uint64_t dispatch_waves_; + uint64_t scratch_size_per_wave_; + uint64_t scratch_size_; + uint64_t total_scratch_size_; + void *scratch_base_; + uint32_t scratch_mem_alignment_size_; + GpuMemoryHandle scratch_mem_; + + std::vector scratch_base_offset_array_; +}; + +class SDMAQueue : public WDDMQueue { +public: + SDMAQueue(WDDMDevice *device, + void *ring, + uint64_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + virtual ~SDMAQueue(); + + hsa_status_t Init(void); + hsa_status_t Fini(void); + hsa_status_t Submit(void); + + int PreparePacket(uint32_t offset, uint64_t size); + + void WaitQueue(void) { + device->CpuWait(&syncobj, &rptr_next, 1, false); + } + + uint64_t * GetRingWptr(void) { return &wptr_next_; } + uint64_t * GetRingRptr(void) { return WDDMQueue::GetSyncAddr(); } + uint64_t * GetDoorbellPtr() { return &doorbell_; } + void RingDoorbell(); + void* GetHsaQueueAddr(void) const { return reinterpret_cast(GetCmdbufAddr()); } + +private: + uint64_t wptr_next_; + uint64_t wptr_pre_; + uint64_t rptr_next; + uint64_t doorbell_; + std::vector> wptr_queue_; + uint64_t ib_size; + uint64_t ib_start_addr; + + std::thread thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void SdmaThread(SDMAQueue *queue); + + struct SDMA_PKT_POLL_REGMEM { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 10; + unsigned int hdp_flush : 1; + unsigned int reserved_1 : 1; + unsigned int func : 3; + unsigned int mem_poll : 1; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int value : 32; + }; + unsigned int DW_3_DATA; + } VALUE_UNION; + + union { + struct { + unsigned int mask : 32; + }; + unsigned int DW_4_DATA; + } MASK_UNION; + + union { + struct { + unsigned int interval : 16; + unsigned int retry_count : 12; + unsigned int reserved_0 : 4; + }; + unsigned int DW_5_DATA; + } DW5_UNION; + }; + const unsigned int SDMA_OP_POLL_REGMEM = 8; + bool IsPollPacket(SDMA_PKT_POLL_REGMEM* pkt) { + return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM && + pkt->HEADER_UNION.mem_poll == 1 && + pkt->HEADER_UNION.func == 3; + } + uint32_t WrapIntoRocrRing(uint64_t idx) { return (idx & (cmdbuf_size - 1)); } +}; + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h new file mode 100644 index 0000000000..0efd9559fd --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h @@ -0,0 +1,61 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_STATUS_H +#define _WSL_INC_WDDM_STATUS_H + +enum class ErrorCode { + Success, + DeviceLost, + UnSupported, + NotReady, + OutOfMemory, + OutOfGpuMemory, + OutOfHandleApeMemory, + Timeout, + SyscallFail, + InvalidateParams, + SameProcessSameDevice, + Unknown, +}; + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h new file mode 100644 index 0000000000..68f0015d6d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h @@ -0,0 +1,233 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_THUNKS_H +#define _WSL_INC_WDDM_THUNKS_H + +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" +#include "dxcore_loader.h" + +namespace wsl { +namespace thunk { + +inline ErrorCode TranslateNtStatus(NTSTATUS status) { + switch (status) { + case STATUS_SUCCESS: + return ErrorCode::Success; + case STATUS_PENDING: + return ErrorCode::NotReady; + case STATUS_NO_MEMORY: + return ErrorCode::OutOfMemory; + case STATUS_DEVICE_REMOVED: + return ErrorCode::DeviceLost; + case STATUS_GRAPHICS_NO_VIDEO_MEMORY: + return ErrorCode::OutOfGpuMemory; + case STATUS_TIMEOUT: + return ErrorCode::Timeout; + case STATUS_INVALID_PARAMETER: + return ErrorCode::InvalidateParams; + default: + break; + } + return ErrorCode::Unknown; +} + +namespace d3dthunk { + +typedef D3DKMT_CREATEALLOCATION CreateAllocationArgs; +typedef D3DKMT_CREATECONTEXT CreateContextArgs; +typedef D3DKMT_CREATECONTEXTVIRTUAL CreateContextVirtualArgs; +typedef D3DKMT_CREATEPAGINGQUEUE CreatePagingQueueArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT CreateSynchronizationObjectArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT2 CreateSynchronizationObject2Args; +typedef D3DKMT_ESCAPE EscapeArgs; +typedef D3DKMT_EVICT EvictArgs; +typedef D3DKMT_FREEGPUVIRTUALADDRESS FreeGpuVirtualAddressArgs; +typedef D3DKMT_LOCK LockArgs; +typedef D3DKMT_LOCK2 Lock2Args; +typedef D3DKMT_OPENRESOURCE OpenResourceArgs; +typedef D3DKMT_OPENRESOURCEFROMNTHANDLE OpenResourceFromNtHandleArgs; +typedef D3DKMT_QUERYADAPTERINFO QueryAdapterInfoArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT SignalSynchronizationObjectArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT2 SignalSynchronizationObject2Args; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMCPU SignalSynchronizationObjectFromCpuArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU2 SignalSynchronizationObjectFromGpuArgs; +typedef D3DKMT_SUBMITCOMMAND SubmitCommandArgs; +typedef D3DKMT_UNLOCK UnlockArgs; +typedef D3DKMT_UNLOCK2 Unlock2Args; +typedef D3DKMT_UPDATEGPUVIRTUALADDRESS UpdateGpuVirtualAddressArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT WaitForSynchronizationObjectArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT2 WaitForSynchronizationObject2Args; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU WaitForSynchronizationObjectFromCpuArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU WaitForSynchronizationObjectFromGpuArgs; +typedef D3DKMT_ACQUIREKEYEDMUTEX AcquireKeyedMutexArgs; +typedef D3DKMT_RELEASEKEYEDMUTEX ReleaseKeyedMutexArgs; +typedef D3DKMT_OPENKEYEDMUTEX OpenKeyedMutexArgs; +typedef D3DKMT_DESTROYKEYEDMUTEX DestroyKeyedMutexArgs; +typedef D3DKMT_QUERYVIDEOMEMORYINFO QueryVideoMemoryInfoArgs; +typedef D3DKMT_CREATEHWQUEUE CreateHwQueueArgs; +typedef D3DKMT_DESTROYHWQUEUE DestroyHwQueueArgs; +typedef D3DKMT_SUBMITCOMMANDTOHWQUEUE SubmitCommandToHwQueueArgs; +typedef D3DKMT_SUBMITPRESENTTOHWQUEUE SubmitPresentToHwQueueArgs; +typedef D3DKMT_SUBMITSIGNALSYNCOBJECTSTOHWQUEUE SubmitSignalSyncObjectsToHwQueueArgs; +typedef D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE SubmitWaitForSyncObjectsToHwQueueArgs; +typedef D3DKMT_CREATESYNCFILE CreateSyncFileArgs; + +inline ErrorCode MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTMapGpuVirtualAddress(args))); +} + +inline ErrorCode CreateAllocation(CreateAllocationArgs *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTCreateAllocation2(args))); +} + +inline ErrorCode DestroyAllocation( + WinDeviceHandle device, + WinResourceHandle resource, + size_t num_allocations, + const WinAllocationHandle *alloc_handles) { + + D3DKMT_DESTROYALLOCATION2 args{}; + + memset(&args, 0, sizeof(args)); + args.hDevice = device; + if (resource) { + args.hResource = resource; + } else { + args.phAllocationList = alloc_handles; + args.AllocationCount = num_allocations; + } + + return TranslateNtStatus(DXCORE_CALL(D3DKMTDestroyAllocation2(&args))); +} + +inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTReserveGpuVirtualAddress(args))); +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize base_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.BaseAddress = base_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize minimum_address, + gpusize maximum_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.MinimumAddress = minimum_address; + args.MaximumAddress = maximum_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTFreeGpuVirtualAddress(args))); +} + +inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, + gpusize base_address, + gpusize size) { + FreeGpuVirtualAddressArgs args{}; + args.hAdapter = handle; + args.Size = size; + args.BaseAddress = base_address; + return FreeGpuVirtualAddress(&args); +} + +inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTMakeResident(args))); +} + +inline ErrorCode Evict(EvictArgs *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTEvict(args))); +} + +inline ErrorCode ShareObjects(size_t num_allocations, + WinResourceHandle resource, + uint32_t flags, + int* dmabuf_fd) { + OBJECT_ATTRIBUTES obj_attr; + HANDLE nt_handle; + ErrorCode ret; + + InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr); + ret = TranslateNtStatus(DXCORE_CALL(D3DKMTShareObjects(num_allocations, + &resource, &obj_attr, flags, &nt_handle))); + if (ret == ErrorCode::Success) + *dmabuf_fd = *(reinterpret_cast(&nt_handle)); + else + *dmabuf_fd = -1; + + return ret; +} + +inline ErrorCode QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTQueryResourceInfoFromNtHandle(args))); +} + +inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTOpenResourceFromNtHandle(args))); +} + +} // namespace d3dthunk +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h new file mode 100644 index 0000000000..0a3ca35ebc --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_TYPES_H_ +#define _WSL_INC_WDDM_TYPES_H_ + +#include +#include +#include "impl/thunk_proxy/wddm_types.h" +// windows wchar is 16bit, but linux is 32bit +// seems libdxcore (not dxgkrnl.ko) convert thunk windows wchar to linux one +// so only accept 32bit wchar args. note driver private data structure still +// use 16bit wchar +#define WCHAR wchar_t +#define PCWSTR const wchar_t * +#include +#undef WCHAR +#undef PCWSTR + +using gpusize = uint64_t; // Used to specify GPU addresses and sizes of GPU allocations +using WinAllocationHandle = D3DKMT_HANDLE; +using WinResourceHandle = D3DKMT_HANDLE; +using WinContextHandle = D3DKMT_HANDLE; +using WinDeviceHandle = D3DKMT_HANDLE; +using WinAdapterHandle = D3DKMT_HANDLE; + +//reference dk/winnt.h +#define STANDARD_RIGHTS_REQUIRED (0x000F0000L) + +//reference dk/ntdef.h +#define OBJ_INHERIT (0x00000002L) +typedef WCHAR *PWCHAR, *LPWCH, *PWCH; +typedef struct _UNICODE_STRING { + USHORT Length; + USHORT MaximumLength; +#ifdef MIDL_PASS + [size_is(MaximumLength / 2), length_is((Length) / 2) ] USHORT * Buffer; +#else // MIDL_PASS + _Field_size_bytes_part_opt_(MaximumLength, Length) PWCH Buffer; +#endif // MIDL_PASS +} UNICODE_STRING; +typedef UNICODE_STRING *PUNICODE_STRING; +typedef const UNICODE_STRING *PCUNICODE_STRING; + +typedef struct _OBJECT_ATTRIBUTES { + ULONG Length; + HANDLE RootDirectory; + PUNICODE_STRING ObjectName; + ULONG Attributes; + PVOID SecurityDescriptor; + PVOID SecurityQualityOfService; +} OBJECT_ATTRIBUTES; +#define InitializeObjectAttributes( p, n, a, r, s ) { \ + (p)->Length = sizeof( OBJECT_ATTRIBUTES ); \ + (p)->RootDirectory = r; \ + (p)->Attributes = a; \ + (p)->ObjectName = n; \ + (p)->SecurityDescriptor = s; \ + (p)->SecurityQualityOfService = NULL; \ + } + +#endif \ No newline at end of file diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h new file mode 100644 index 0000000000..675bfc3e39 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h @@ -0,0 +1,86 @@ +#ifndef _WSL_INC_WDDM_VA_MGR_H_ +#define _WSL_INC_WDDM_VA_MGR_H_ + +#include +#include +#include "util/utils.h" + +namespace wsl { +namespace thunk { + +class VaMgr { +public: + VaMgr(uint64_t start, uint64_t size, uint64_t min_align); + ~VaMgr(); + + /* Allocate `bytes` VA, if `align` is not zero, the returned address is aligned by `align`. + * If `addr` parameter is not zero, try best to allocate VA from fixed address `addr`. + */ + uint64_t Alloc(uint64_t bytes, uint64_t align, uint64_t addr = 0); + + void Free(uint64_t addr); + +private: + uint64_t AllocImpl(uint64_t bytes, uint64_t align); + + struct Fragment { + using ptr = std::multimap::iterator; + ptr free_list_entry_; + + struct { + uint64_t size : 63; + bool is_free : 1; + }; + + Fragment() : size(0), is_free(false) {} + Fragment(ptr iterator, uint64_t len, bool is_free) + : free_list_entry_(iterator), size(len), is_free(is_free) {} + }; + + static inline Fragment make_fragment(typename Fragment::ptr iter, uint64_t len) { + return {iter, len, true}; + } + + inline Fragment make_fragment(uint64_t len) { return {free_list_.end(), len, false}; } + + static inline bool is_free(const Fragment& f) { return f.is_free; } + void set_used(Fragment& f) { + f.is_free = false; + f.free_list_entry_ = free_list_.end(); + } + static void set_free(Fragment& f, typename Fragment::ptr iter) { + f.free_list_entry_ = iter; + f.is_free = true; + } + + inline void remove_free_list_entry(Fragment& frag) { + if (frag.free_list_entry_ != free_list_.end()) { + free_list_.erase(frag.free_list_entry_); + frag.free_list_entry_ = free_list_.end(); + } + } + + inline void add_free_fragment(uint64_t size, uint64_t base) { + auto it = free_list_.insert(std::make_pair(size, base)); + frag_map_[base] = make_fragment(it, size); + } + + inline void add_used_fragment(uint64_t size, uint64_t base) { + frag_map_[base] = make_fragment(size); + } + // Indexed by size + std::multimap free_list_; + // Indexed by VA, each fragment has no overlap + std::map frag_map_; + + uint64_t min_align_; + + std::mutex lock_; // Mutex protecting allocation and free of va + + + DISALLOW_COPY_AND_ASSIGN(VaMgr); +}; + +} // namespace thunk +} // namespace wsl +#endif