diff --git a/projects/rocr-runtime/libhsakmt/CMakeLists.txt b/projects/rocr-runtime/libhsakmt/CMakeLists.txt index 25b3af4af8..be539e2553 100644 --- a/projects/rocr-runtime/libhsakmt/CMakeLists.txt +++ b/projects/rocr-runtime/libhsakmt/CMakeLists.txt @@ -25,6 +25,9 @@ cmake_minimum_required ( VERSION 3.6.3 ) +if (WIN_SDK) + include(${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists_wsl.txt) +else () set(CMAKE_VERBOSE_MAKEFILE ON) set ( HSAKMT "hsakmt" ) @@ -319,3 +322,4 @@ endif() ########################### # Use component packaging set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") +endif() diff --git a/projects/rocr-runtime/libhsakmt/CMakeLists_wsl.txt b/projects/rocr-runtime/libhsakmt/CMakeLists_wsl.txt new file mode 100644 index 0000000000..e07f9f9932 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/CMakeLists_wsl.txt @@ -0,0 +1,309 @@ +################################################################################ +## +## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. +## +## MIT LICENSE: +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +## of the Software, and to permit persons to whom the Software is furnished to do +## so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+set(ROCDXG "rocdxg")
+set(ROCDXG_PACKAGE "rocdxg-roct")
+set(ROCDXG_COMPONENT "lib${ROCDXG}")
+set(ROCDXG_TARGET "${ROCDXG}")
+set(ROCDXG_VERSION "1.1.0")
+
+project(${ROCDXG_TARGET} VERSION ${ROCDXG_VERSION})
+# The same version string is handed to the sources as a compile definition on the target further down.
+
+# Opt-in ccache support: with ROCM_CCACHE_BUILD=ON every compile is launched through ccache when it is found.
+set(ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build")
+if(ROCM_CCACHE_BUILD)
+  find_program(CCACHE_PROGRAM ccache)
+  if(NOT CCACHE_PROGRAM)
+    message(WARNING "Unable to find ccache. Falling back to real compiler")
+  else()
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM})
+  endif()
+endif()
+
+list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
+
+## Pull in the project's shared helpers plus the standard install-directory variables
+include(utils)
+include(GNUInstallDirs)
+
+## Setup the package version.
+get_version ( "${ROCDXG_VERSION}" )  # presumably from the `utils` module; the code below reads VERSION_MAJOR/MINOR/PATCH and, optionally, VERSION_BUILD
+
+set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} )
+set ( BUILD_VERSION_MINOR ${VERSION_MINOR} )
+set ( BUILD_VERSION_PATCH ${VERSION_PATCH} )
+
+set ( LIB_VERSION_MAJOR ${VERSION_MAJOR} )
+set ( LIB_VERSION_MINOR ${VERSION_MINOR} )
+set ( LIB_VERSION_PATCH ${VERSION_PATCH} )
+
+set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" )
+
+if ( DEFINED VERSION_BUILD AND NOT "${VERSION_BUILD}" STREQUAL "" )  # quoted so a defined-but-empty value cannot drop out of the condition
+  message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" )
+  set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" )
+endif ()
+set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
+
+## Compiler flags: kept as a CMake list and attached to the target with target_compile_options() below
+set ( ROCDXG_CXX_FLAGS -fPIC -include ${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/librocdxg.h )
+
+if ( CMAKE_COMPILER_IS_GNUCC )
+  list ( APPEND ROCDXG_CXX_FLAGS -Wlogical-op )
+endif ()
+if ( ROCDXG_WERROR )  # test the variable itself instead of its expansion
+  list ( APPEND ROCDXG_CXX_FLAGS -Werror )
+endif ()
+if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
+  list ( APPEND ROCDXG_CXX_FLAGS -O2 )
+else ()
+  list ( APPEND ROCDXG_CXX_FLAGS -g )
+endif ()
+
+set ( ROCDXG_LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/librocdxg.ver" )
+
+## Linker Flags (a single space-separated string, stored in the LINK_FLAGS target property below)
+## Add --enable-new-dtags to generate DT_RUNPATH
+set ( ROCDXG_LINK_FLAGS "${ROCDXG_LINK_FLAGS} -Wl,--enable-new-dtags -Wl,--version-script=${ROCDXG_LINKER_SCRIPT} -Wl,-soname=${ROCDXG_COMPONENT}.so.${LIB_VERSION_MAJOR} -Wl,-z,nodelete" )
+
+## Linker undefined symbol handling
+if ( CMAKE_COMPILER_IS_GNUCC )
+  set ( ROCDXG_LINK_FLAGS "${ROCDXG_LINK_FLAGS} -Wl,-no-undefined" )
+else ()
+  set ( ROCDXG_LINK_FLAGS "${ROCDXG_LINK_FLAGS} -Wl,-undefined,error" )
+endif ()
+
+## Source files (paths are relative to this directory)
+set ( ROCDXG_SRC "src/dxg/debug.cpp"
+                 "src/dxg/events.cpp"
+                 "src/dxg/memory.cpp"
+                 "src/dxg/libdrm.cpp"
+                 "src/dxg/hsa.cpp"
+                 "src/dxg/openclose.cpp"
+                 "src/dxg/perfctr.cpp"
+                 "src/dxg/queues.cpp"
+                 "src/dxg/time.cpp"
+                 "src/dxg/topology.cpp"
+                 "src/dxg/spm.cpp"
+                 "src/dxg/version.cpp"
+                 "src/dxg/svm.cpp"
+                 "src/dxg/pc_sampling.cpp"
+                 "src/dxg/hsakmtmodel.cpp"
+                 "src/dxg/dxcore_loader.cpp"
+                 "src/dxg/ais.cpp"
+                 "src/dxg/wddm/device.cpp"
+                 "src/dxg/wddm/gpu_memory.cpp"
+                 "src/dxg/wddm/va_mgr.cpp"
+                 "src/dxg/wddm/queue.cpp"
+                 "src/dxg/wddm/cmd_util.cpp" )
+
+## Declare the library target name
+add_library ( ${ROCDXG_TARGET} SHARED "" )
+
+## Add sources
+target_sources ( ${ROCDXG_TARGET} PRIVATE ${ROCDXG_SRC} )
+
+## Add headers. The public headers need to point at their location in both build and install
+## directory layouts. This declaration allows publishing library use data to downstream clients.
+target_include_directories( ${ROCDXG_TARGET}
+  PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  PRIVATE
+  ${WIN_SDK}
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/dxg )
+
+target_compile_definitions ( ${ROCDXG_TARGET} PRIVATE LINUX __AMD64__ LITTLEENDIAN_CPU HSA_LARGE_MODEL )  # scoped to this target rather than the whole directory
+
+# Ensure version macro is defined for this target
+target_compile_definitions(${ROCDXG_TARGET} PRIVATE ROCDXG_VERSION="${ROCDXG_VERSION}")
+
+target_link_directories(${ROCDXG_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/dxg/thunk_proxy)
+target_link_libraries(${ROCDXG_TARGET} PRIVATE thunk_proxy)
+
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY LINK_FLAGS "${ROCDXG_LINK_FLAGS}" )
+
+## Set the VERSION and SOVERSION values
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY VERSION "${LIB_VERSION_STRING}" )
+set_property ( TARGET ${ROCDXG_TARGET} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" )
+
+find_package(PkgConfig)
+# get OS-info for OS-specific build dependencies
+get_os_info()
+# Check for libraries required for building
+find_library(LIBC NAMES c REQUIRED)
+message(STATUS "LIBC:" ${LIBC})
+
+target_link_libraries ( ${ROCDXG_TARGET}
+  PRIVATE pthread rt c ${CMAKE_DL_LIBS}
+)
+
+target_compile_options(${ROCDXG_TARGET} PRIVATE ${ROCDXG_CXX_FLAGS})
+
+## Define default paths and packages.
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  set(CMAKE_INSTALL_PREFIX "/opt/rocm")
+endif()
+set(CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} CACHE STRING "Default installation directory." FORCE)
+set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE STRING "Default packaging prefix.")
+set(CPACK_GENERATOR "DEB" CACHE STRING "Default packaging generators.")
+
+# Install the library (component "binary") and add it to the ${ROCDXG_TARGET}Targets export set
+install(TARGETS ${ROCDXG_TARGET} EXPORT ${ROCDXG_TARGET}Targets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary)
+
+# Install public headers (currently disabled)
+#install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/${ROCDXG_TARGET} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+#  COMPONENT dev PATTERN "*drm*" EXCLUDE )
+
+# Install the export set so that clients' find_package() calls can import the target.
+install(EXPORT ${ROCDXG_TARGET}Targets
+  FILE ${ROCDXG_TARGET}Targets.cmake
+  NAMESPACE ${ROCDXG_TARGET}::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET}
+  COMPONENT dev)
+
+# Adds the target alias rocdxg::rocdxg to the local build.
+# This isn't necessary today. It's harmless preparation for some
+# hypothetical future in which we might be included by add_subdirectory()
+# in some other project's cmake file. It allows uniform use of find_package
+# and target_link_libraries() without regard to whether a target is external or
+# a subdirectory of the current build.
+add_library(${ROCDXG_TARGET}::${ROCDXG_TARGET} ALIAS ${ROCDXG_TARGET})
+
+# Generate the package config and version files consumed by find_package()
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file(${ROCDXG_TARGET}-config.cmake.in
+  ${ROCDXG_TARGET}-config.cmake
+  INSTALL_DESTINATION
+  ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET})
+
+write_basic_package_version_file(${ROCDXG_TARGET}-config-version.cmake
+  VERSION ${BUILD_VERSION_STRING}
+  COMPATIBILITY
+  AnyNewerVersion)
+
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/${ROCDXG_TARGET}-config.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/${ROCDXG_TARGET}-config-version.cmake
+  DESTINATION
+  ${CMAKE_INSTALL_LIBDIR}/cmake/${ROCDXG_TARGET}
+  COMPONENT dev)
+
+# Optional: register the build tree in the user's CMake package registry (off unless requested).
+if(NOT DEFINED EXPORT_TO_USER_PACKAGE_REGISTRY)
+  set(EXPORT_TO_USER_PACKAGE_REGISTRY "off")
+endif()
+set(EXPORT_TO_USER_PACKAGE_REGISTRY ${EXPORT_TO_USER_PACKAGE_REGISTRY}
+  CACHE BOOL "Add cmake package config location to the user's cmake package registry.")
+if(${EXPORT_TO_USER_PACKAGE_REGISTRY})
+  # export(PACKAGE) only writes to the registry when this is switched on
+  set(CMAKE_EXPORT_PACKAGE_REGISTRY ON)
+  # Write a targets file describing the build-tree target
+  export(TARGETS ${ROCDXG_TARGET} NAMESPACE ${ROCDXG_TARGET}:: FILE ${ROCDXG_TARGET}Targets.cmake)
+  # Store the build directory in the user's package registry
+  export(PACKAGE ${ROCDXG_TARGET})
+endif()
+
+# Since librocdxg.pc and libhsakmt.pc are installed to the same pkgconfig directory,
+# we can directly use libhsakmt's header file path in the includedir.
+# This allows librocdxg to reference the same header files as libhsakmt without
+# duplicating header installation.
+configure_file ( librocdxg.pc.in librocdxg.pc @ONLY )
+
+install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/librocdxg.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig COMPONENT dev )
+
+install(CODE "execute_process(COMMAND ldconfig)" COMPONENT binary)  # NOTE(review): runs on every install and is not gated by ENABLE_LDCONFIG - confirm intended
+
+###########################
+# Packaging directives
+###########################
+# Use component packaging: the "binary" and "dev" install components become separate .deb files
+set(CPACK_COMPONENTS_GROUPING IGNORE)
+set(CPACK_DEB_COMPONENT_INSTALL ON)
+set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
+set(CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR})
+set(CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR})
+set(CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH})
+set(CPACK_PACKAGE_CONTACT "AMD GFX mailing list <amd-gfx@lists.freedesktop.org>")
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
+set(CPACK_COMPONENT_DESCRIPTION "ROCDXG development package.\n This package includes the user-mode API interfaces\nused to interact with the ROCm driver.\n This package contains the libraries and cmake files for the ROCDXG package.")
+set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")
+
+# Install License file
+install ( FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary )
+
+# Prepare final version for the CPACK use
+set(PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
+set(CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}")
+
+# Debian package specific variables (names derive from ROCDXG_PACKAGE so they cannot drift apart)
+set(CPACK_DEBIAN_PACKAGE_NAME "${ROCDXG_PACKAGE}")
+
+# Debian binary package specific variables (runtime package)
+set(CPACK_DEBIAN_BINARY_PACKAGE_NAME "${ROCDXG_PACKAGE}")
+set(CPACK_DEBIAN_BINARY_PACKAGE_DESCRIPTION "ROCDXG runtime package containing libraries")
+
+# Debian dev package specific variables
+set(CPACK_DEBIAN_DEV_PACKAGE_NAME "${ROCDXG_PACKAGE}-dev")
+set(CPACK_DEBIAN_DEV_PACKAGE_DESCRIPTION "ROCDXG development package containing pkgconfig and cmake files")
+
+## Process the Debian install/remove scripts to update the CPACK variables (absolute paths: CPack need not run from this binary directory)
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY )
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY )
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_BINARY_DIR}/DEBIAN/prerm")
+
+# Setting package dependencies
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core")
+set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "rocm-core")
+set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "${ROCDXG_PACKAGE} (= ${PACKAGE_VERSION_STR}), rocm-core")
+
+# Set the names now using CPACK utility
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+
+# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake (inputs quoted so an empty value is still one argument)
+if(NOT ROCM_DEP_ROCMCORE)
+  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}")
+  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS}")
+  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "${CPACK_DEBIAN_DEV_PACKAGE_DEPENDS}")
+endif()
+
+include(CPack)
+
+# Add component descriptions
+cpack_add_component(binary
+  DISPLAY_NAME "Runtime"
+  DESCRIPTION "ROCDXG runtime libraries")
+
+cpack_add_component(dev
+  DISPLAY_NAME "Development"
+  DESCRIPTION "ROCDXG development files (pkgconfig and cmake)")
diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu.h
new file mode 100644
index 0000000000..2b56bd3aac
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu.h
@@ -0,0 +1,2171 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/** + * \file amdgpu.h + * + * Declare public libdrm_amdgpu API + * + * This file define API exposed by libdrm_amdgpu library. + * User wanted to use libdrm_amdgpu functionality must include + * this file. + * + */ +#ifndef _AMDGPU_H_ +#define _AMDGPU_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct drm_amdgpu_info_hw_ip; +struct drm_amdgpu_bo_list_entry; +struct drm_amdgpu_capability; + +/*--------------------------------------------------------------------------*/ +/* --------------------------- Defines ------------------------------------ */ +/*--------------------------------------------------------------------------*/ + +/** + * Define max. 
number of Command Buffers (IB) which could be sent to the single
+ * hardware IP to accommodate CE/DE requirements
+ *
+ * \sa amdgpu_cs_ib_info
+*/
+#define AMDGPU_CS_MAX_IBS_PER_SUBMIT		4
+
+/**
+ * Special timeout value meaning that the timeout is infinite (all 64 bits set).
+ */
+#define AMDGPU_TIMEOUT_INFINITE			0xffffffffffffffffull
+
+/**
+ * Used in amdgpu_cs_query_fence_status(), meaning that the given timeout
+ * is absolute (flag bit 0).
+ */
+#define AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE     (1 << 0)
+
+/*--------------------------------------------------------------------------*/
+/* ----------------------------- Enums ------------------------------------ */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Enum describing possible handle types
+ *
+ * \sa amdgpu_bo_import, amdgpu_bo_export
+ *
+*/
+enum amdgpu_bo_handle_type {
+	/** GEM flink name (needs DRM authentication, used by DRI2) */
+	amdgpu_bo_handle_type_gem_flink_name = 0,
+
+	/** KMS handle which is used by all driver ioctls */
+	amdgpu_bo_handle_type_kms = 1,
+
+	/** DMA-buf fd handle */
+	amdgpu_bo_handle_type_dma_buf_fd = 2,
+
+	/** Deprecated in favour of, and with the same behaviour as,
+	 * amdgpu_bo_handle_type_kms; use that instead of this
+	 */
+	amdgpu_bo_handle_type_kms_noimport = 3,
+};
+
+/** Define known types of GPU VM VA ranges */
+enum amdgpu_gpu_va_range
+{
+	/** Allocate from "normal"/general range */
+	amdgpu_gpu_va_range_general = 0
+};
+
+enum amdgpu_sw_info {
+	amdgpu_sw_info_address32_hi = 0,
+};
+
+/*--------------------------------------------------------------------------*/
+/* -------------------------- Datatypes ----------------------------------- */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Define opaque pointer to context associated with fd.
+ * This context will be returned as the result of the
+ * "initialize" function and should be passed as the first
+ * parameter to any API call
+ */
+typedef struct amdgpu_device *amdgpu_device_handle;
+
+/**
+ * Define GPU Context type as pointer to opaque structure
+ * Example of GPU Context is the "rendering" context associated
+ * with OpenGL context (glCreateContext)
+ */
+typedef struct amdgpu_context *amdgpu_context_handle;
+
+/**
+ * Define handle for amdgpu resources: buffer, GDS, etc.
+ */
+typedef struct amdgpu_bo *amdgpu_bo_handle;
+
+/**
+ * Define handle for list of BOs
+ */
+typedef struct amdgpu_bo_list *amdgpu_bo_list_handle;
+
+/**
+ * Define handle to be used to work with VA allocated ranges
+ */
+typedef struct amdgpu_va *amdgpu_va_handle;
+
+/**
+ * Define handle dealing with VA allocation. An amdgpu_device
+ * owns one of these, but they can also be used without a device.
+ */
+typedef struct amdgpu_va_manager *amdgpu_va_manager_handle;
+
+/**
+ * Define handle for semaphore
+ */
+typedef struct amdgpu_semaphore *amdgpu_semaphore_handle;
+
+/**
+ * Define handle for sem file
+ */
+typedef uint32_t amdgpu_sem_handle;
+
+
+/*--------------------------------------------------------------------------*/
+/* -------------------------- Structures ---------------------------------- */
+/*--------------------------------------------------------------------------*/
+
+/**
+ * Structure describing memory allocation request
+ *
+ * \sa amdgpu_bo_alloc()
+ *
+*/
+struct amdgpu_bo_alloc_request {
+	/** Allocation request. It must be aligned correctly. */
+	uint64_t alloc_size;
+
+	/**
+	 * It may be required to have some specific alignment requirements
+	 * for physical back-up storage (e.g. for displayable surface).
+	 * If 0 there is no special alignment requirement
+	 */
+	uint64_t phys_alignment;
+
+	/**
+	 * UMD should specify where to allocate memory and how it
+	 * will be accessed by the CPU.
+	 */
+	uint32_t preferred_heap;
+
+	/** Additional flags passed on allocation */
+	uint64_t flags;
+};
+
+/**
+ * Special UMD specific information associated with buffer.
+ *
+ * It may be necessary to pass some buffer characteristics as part
+ * of buffer sharing. Such information is defined by the UMD and is
+ * opaque to libdrm_amdgpu as well as to the kernel driver.
+ *
+ * \sa amdgpu_bo_set_metadata(), amdgpu_bo_query_info,
+ *     amdgpu_bo_import(), amdgpu_bo_export
+ *
+*/
+struct amdgpu_bo_metadata {
+	/** Special flag associated with surface */
+	uint64_t flags;
+
+	union {
+		/**
+		 * ASIC-specific tiling information (also used by DCE).
+		 * The encoding is defined by the AMDGPU_TILING_* definitions.
+		 */
+		uint64_t tiling_info;
+		/**
+		 * ASIC-specific swizzle information.
+		 * The encoding is defined by the AMDGPU_SWIZZLE_* definitions.
+		 */
+		uint64_t swizzle_info;
+	};
+
+	/** Size of metadata associated with the buffer, in bytes. */
+	uint32_t size_metadata;
+
+	/** UMD specific metadata. Opaque for kernel */
+	uint32_t umd_metadata[64];
+};
+
+/**
+ * Structure describing allocated buffer. Client may need
+ * to query such information as part of 'sharing' buffers mechanism
+ *
+ * \sa amdgpu_bo_set_metadata(), amdgpu_bo_query_info(),
+ *     amdgpu_bo_import(), amdgpu_bo_export()
+*/
+struct amdgpu_bo_info {
+	/** Allocated memory size */
+	uint64_t alloc_size;
+
+	/**
+	 * It may be required to have some specific alignment requirements
+	 * for physical back-up storage.
+	 */
+	uint64_t phys_alignment;
+
+	/** Heap where to allocate memory. */
+	uint32_t preferred_heap;
+
+	/** Additional allocation flags. */
+	uint64_t alloc_flags;
+
+	/** Metadata associated with buffer if any. */
+	struct amdgpu_bo_metadata metadata;
+};
+
+/**
+ * Structure with information about "imported" buffer
+ *
+ * \sa amdgpu_bo_import()
+ *
+ */
+struct amdgpu_bo_import_result {
+	/** Handle of memory/buffer to use */
+	amdgpu_bo_handle buf_handle;
+
+	 /** Buffer size */
+	uint64_t alloc_size;
+};
+
+/**
+ *
+ * Structure to describe GDS partitioning information.
+ * \note OA and GWS resources are associated with GDS partition
+ *
+ * \sa amdgpu_gpu_resource_query_gds_info
+ *
+*/
+struct amdgpu_gds_resource_info {
+	uint32_t gds_gfx_partition_size;
+	uint32_t compute_partition_size;
+	uint32_t gds_total_size;
+	uint32_t gws_per_gfx_partition;
+	uint32_t gws_per_compute_partition;
+	uint32_t oa_per_gfx_partition;
+	uint32_t oa_per_compute_partition;
+};
+
+/**
+ * Structure describing CS fence
+ *
+ * \sa amdgpu_cs_query_fence_status(), amdgpu_cs_request, amdgpu_cs_submit()
+ *
+*/
+struct amdgpu_cs_fence {
+
+	/** In which context IB was sent to execution */
+	amdgpu_context_handle context;
+
+	/** To which HW IP type the fence belongs */
+	uint32_t ip_type;
+
+	/** IP instance index if there are several IPs of the same type. */
+	uint32_t ip_instance;
+
+	/** Ring index of the HW IP */
+	uint32_t ring;
+
+	/** Specify fence for which we need to check submission status.*/
+	uint64_t fence;
+};
+
+/**
+ * Structure describing IB
+ *
+ * \sa amdgpu_cs_request, amdgpu_cs_submit()
+ *
+*/
+struct amdgpu_cs_ib_info {
+	/** Special flags */
+	uint64_t flags;
+
+	/** Virtual MC address of the command buffer */
+	uint64_t ib_mc_address;
+
+	/**
+	 * Size of Command Buffer to be submitted.
+	 *   - The size is in units of dwords (4 bytes).
+	 *   - Could be 0
+	 */
+	uint32_t size;
+};
+
+/**
+ * Structure describing fence information
+ *
+ * \sa amdgpu_cs_request, amdgpu_cs_query_fence,
+ *     amdgpu_cs_submit(), amdgpu_cs_query_fence_status()
+*/
+struct amdgpu_cs_fence_info {
+	/** buffer object for the fence */
+	amdgpu_bo_handle handle;
+
+	/** fence offset in the unit of sizeof(uint64_t) */
+	uint64_t offset;
+};
+
+/**
+ * Structure describing submission request
+ *
+ * \note We could have several IBs as packet. e.g. CE, CE, DE case for gfx
+ *
+ * \sa amdgpu_cs_submit()
+*/
+struct amdgpu_cs_request {
+	/** Specify flags with additional information */
+	uint64_t flags;
+
+	/** Specify HW IP block type to which to send the IB. */
+	unsigned ip_type;
+
+	/** IP instance index if there are several IPs of the same type. */
+	unsigned ip_instance;
+
+	/**
+	 * Specify ring index of the IP. We could have several rings
+	 * in the same IP. E.g. 0 for SDMA0 and 1 for SDMA1.
+	 */
+	uint32_t ring;
+
+	/**
+	 * List handle with resources used by this request.
+	 */
+	amdgpu_bo_list_handle resources;
+
+	/**
+	 * Number of dependencies this Command submission needs to
+	 * wait for before starting execution.
+	 */
+	uint32_t number_of_dependencies;
+
+	/**
+	 * Array of dependencies which need to be met before
+	 * execution can start.
+	 */
+	struct amdgpu_cs_fence *dependencies;
+
+	/** Number of IBs to submit in the field ibs. */
+	uint32_t number_of_ibs;
+
+	/**
+	 * IBs to submit. Those IBs will be submitted together as a single entity
+	 */
+	struct amdgpu_cs_ib_info *ibs;
+
+	/**
+	 * The returned sequence number for the command submission
+	 */
+	uint64_t seq_no;
+
+	/**
+	 * The fence information
+	 */
+	struct amdgpu_cs_fence_info fence_info;
+};
+
+/**
+ * Structure which provides information about GPU VM MC Address space
+ * alignment requirements
+ *
+ * \sa amdgpu_query_buffer_size_alignment
+ */
+struct amdgpu_buffer_size_alignments {
+	/** Size alignment requirement for allocation in
+	 * local memory */
+	uint64_t size_local;
+
+	/**
+	 * Size alignment requirement for allocation in remote memory
+	 */
+	uint64_t size_remote;
+};
+
+/**
+ * Structure which provides information about a heap
+ *
+ * \sa amdgpu_query_heap_info()
+ *
+ */
+struct amdgpu_heap_info {
+	/** Theoretical max. available memory in the given heap */
+	uint64_t heap_size;
+
+	/**
+	 * Number of bytes allocated in the heap. This includes all processes
+	 * and private allocations in the kernel. It changes when new buffers
+	 * are allocated, freed, and moved. It cannot be larger than
+	 * heap_size.
+	 */
+	uint64_t heap_usage;
+
+	/**
+	 * Theoretical possible max. size of buffer which
+	 * could be allocated in the given heap
+	 */
+	uint64_t max_allocation;
+};
+
+/**
+ * Describe GPU h/w info needed for UMD correct initialization
+ *
+ * \sa amdgpu_query_gpu_info()
+*/
+struct amdgpu_gpu_info {
+	/** Asic id */
+	uint32_t asic_id;
+	/** Chip revision */
+	uint32_t chip_rev;
+	/** Chip external revision */
+	uint32_t chip_external_rev;
+	/** Family ID */
+	uint32_t family_id;
+	/** Special flags */
+	uint64_t ids_flags;
+	/** max engine clock*/
+	uint64_t max_engine_clk;
+	/** max memory clock */
+	uint64_t max_memory_clk;
+	/** number of shader engines */
+	uint32_t num_shader_engines;
+	/** number of shader arrays per engine */
+	uint32_t num_shader_arrays_per_engine;
+	/**  Number of available good shader pipes */
+	uint32_t avail_quad_shader_pipes;
+	/**  Max. number of shader pipes (including good and bad pipes) */
+	uint32_t max_quad_shader_pipes;
+	/** Number of parameter cache entries per shader quad pipe */
+	uint32_t cache_entries_per_quad_pipe;
+	/**  Number of available graphics context */
+	uint32_t num_hw_gfx_contexts;
+	/** Number of render backend pipes */
+	uint32_t rb_pipes;
+	/**  Enabled render backend pipe mask */
+	uint32_t enabled_rb_pipes_mask;
+	/** Frequency of GPU Counter */
+	uint32_t gpu_counter_freq;
+	/** CC_RB_BACKEND_DISABLE.BACKEND_DISABLE per SE */
+	uint32_t backend_disable[4];
+	/** Value of MC_ARB_RAMCFG register*/
+	uint32_t mc_arb_ramcfg;
+	/** Value of GB_ADDR_CONFIG */
+	uint32_t gb_addr_cfg;
+	/** Values of the GB_TILE_MODE0..31 registers */
+	uint32_t gb_tile_mode[32];
+	/** Values of GB_MACROTILE_MODE0..15 registers */
+	uint32_t gb_macro_tile_mode[16];
+	/** Value of PA_SC_RASTER_CONFIG register per SE */
+	uint32_t pa_sc_raster_cfg[4];
+	/** Value of PA_SC_RASTER_CONFIG_1 register per SE */
+	uint32_t pa_sc_raster_cfg1[4];
+	/* CU info */
+	uint32_t cu_active_number;
+	uint32_t cu_ao_mask;
+	uint32_t cu_bitmap[4][4];
+	/* video memory type info*/
+	uint32_t vram_type;
+	/* video memory bit width*/
+	uint32_t vram_bit_width;
+	/** constant engine ram size*/
+	uint32_t ce_ram_size;
+	/* vce harvesting instance */
+	uint32_t vce_harvest_config;
+	/* PCI revision ID */
+	uint32_t pci_rev_id;
+};
+
+
+/*--------------------------------------------------------------------------*/
+/*------------------------- Functions --------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+/*
+ * Initialization / Cleanup
+ *
+*/
+
+/**
+ *
+ * \param   fd            - \c [in]  File descriptor for AMD GPU device
+ *                                   received previously as the result of
+ *                                   e.g. drmOpen() call.
+ *                                   For legacy fd type, the DRI2/DRI3
+ *                                   authentication should be done before
+ *                                   calling this function.
+ * \param   major_version - \c [out] Major version of library.
It is assumed + * that adding new functionality will cause + * increase in major version + * \param minor_version - \c [out] Minor version of library + * \param device_handle - \c [out] Pointer to opaque context which should + * be passed as the first parameter on each + * API call + * + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * + * \sa amdgpu_device_deinitialize() +*/ +int amdgpu_device_initialize(int fd, + uint32_t *major_version, + uint32_t *minor_version, + amdgpu_device_handle *device_handle); + +/** + * Same as amdgpu_device_initialize() except when deduplicate_device + * is false *and* fd points to a device that was already initialized. + * In this case, amdgpu_device_initialize would return the same + * amdgpu_device_handle while here amdgpu_device_initialize2 would + * return a new handle. + * amdgpu_device_initialize() should be preferred in most situations; + * the only use-case where not-deduplicating devices make sense is + * when one wants to have isolated device handles in the same process. + */ +int amdgpu_device_initialize2(int fd, bool deduplicate_device, + uint32_t *major_version, + uint32_t *minor_version, + amdgpu_device_handle *device_handle); +/** + * + * When access to such library does not needed any more the special + * function must be call giving opportunity to clean up any + * resources if needed. + * + * \param device_handle - \c [in] Context associated with file + * descriptor for AMD GPU device + * received previously as the + * result e.g. of drmOpen() call. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_device_initialize() + * +*/ +int amdgpu_device_deinitialize(amdgpu_device_handle device_handle); + +/** + * + * /param device_handle - \c [in] Device handle. + * See #amdgpu_device_initialize() + * + * \return Returns the drm fd used for operations on this + * device. This is still owned by the library and hence + * should not be closed. 
Guaranteed to be valid until + * #amdgpu_device_deinitialize gets called. + * +*/ +int amdgpu_device_get_fd(amdgpu_device_handle device_handle); + +/* + * Memory Management + * +*/ + +/** + * Allocate memory to be used by UMD for GPU related operations + * + * \param dev - \c [in] Device handle. + * See #amdgpu_device_initialize() + * \param alloc_buffer - \c [in] Pointer to the structure describing an + * allocation request + * \param buf_handle - \c [out] Allocated buffer handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_free() +*/ +int amdgpu_bo_alloc(amdgpu_device_handle dev, + struct amdgpu_bo_alloc_request *alloc_buffer, + amdgpu_bo_handle *buf_handle); + +/** + * Associate opaque data with buffer to be queried by another UMD + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param buf_handle - \c [in] Buffer handle + * \param info - \c [in] Metadata to associated with buffer + * + * \return 0 on success\n + * <0 - Negative POSIX Error code +*/ +int amdgpu_bo_set_metadata(amdgpu_bo_handle buf_handle, + struct amdgpu_bo_metadata *info); + +/** + * Query buffer information including metadata previusly associated with + * buffer. + * + * \param dev - \c [in] Device handle. + * See #amdgpu_device_initialize() + * \param buf_handle - \c [in] Buffer handle + * \param info - \c [out] Structure describing buffer + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_set_metadata(), amdgpu_bo_alloc() +*/ +int amdgpu_bo_query_info(amdgpu_bo_handle buf_handle, + struct amdgpu_bo_info *info); + +/** + * Allow others to get access to buffer + * + * \param dev - \c [in] Device handle. 
+ * See #amdgpu_device_initialize() + * \param buf_handle - \c [in] Buffer handle + * \param type - \c [in] Type of handle requested + * \param shared_handle - \c [out] Special "shared" handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_import() + * +*/ +int amdgpu_bo_export(amdgpu_bo_handle buf_handle, + enum amdgpu_bo_handle_type type, + uint32_t *shared_handle); + +/** + * Request access to "shared" buffer + * + * \param dev - \c [in] Device handle. + * See #amdgpu_device_initialize() + * \param type - \c [in] Type of handle requested + * \param shared_handle - \c [in] Shared handle received as result "import" + * operation + * \param output - \c [out] Pointer to structure with information + * about imported buffer + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \note Buffer must be "imported" only using new "fd" (different from + * one used by "exporter"). + * + * \sa amdgpu_bo_export() + * +*/ +int amdgpu_bo_import(amdgpu_device_handle dev, + enum amdgpu_bo_handle_type type, + uint32_t shared_handle, + struct amdgpu_bo_import_result *output); + +/** + * Allow others to get access to crtc's framebuffer + * + * \param dev - \c [in] Device handle. + * See #amdgpu_device_initialize() + * \param fb_id - \c [out] the first crtc's framebuffer's buffer_id + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_get_fb_id() + * +*/ +int amdgpu_get_fb_id(amdgpu_device_handle dev, unsigned int *fb_id); + +/** + * Get the framebuffer's bo by fb_id + * + * \param dev - \c [in] Device handle. 
+ * See #amdgpu_device_initialize() + * \param fb_id - \c [in] the framebuffer's buffer_id + * + * \param output - \c [out] the bo of fb_id + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_get_bo_from_fb_id() + * +*/ +int amdgpu_get_bo_from_fb_id(amdgpu_device_handle dev, unsigned int fb_id, struct amdgpu_bo_import_result *output); + +/** + * Request GPU access to user allocated memory e.g. via "malloc" + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param cpu - [in] CPU address of user allocated memory which we + * want to map to GPU address space (make GPU accessible) + * (This address must be correctly aligned). + * \param size - [in] Size of allocation (must be correctly aligned) + * \param buf_handle - [out] Buffer handle for the userptr memory + * resource on submission and be used in other operations. + * + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \note + * This call doesn't guarantee that such memory will be persistently + * "locked" / make non-pageable. The purpose of this call is to provide + * opportunity for GPU get access to this resource during submission. + * + * The maximum amount of memory which could be mapped in this call depends + * if overcommit is disabled or not. If overcommit is disabled then the max. + * amount of memory to be pinned will be limited by left "free" size in total + * amount of memory which could be locked simultaneously ("GART" size). + * + * Supported (theoretical) max. size of mapping is restricted only by + * "GART" size. + * + * It is responsibility of caller to correctly specify access rights + * on VA assignment. +*/ +int amdgpu_create_bo_from_user_mem(amdgpu_device_handle dev, + void *cpu, uint64_t size, + amdgpu_bo_handle *buf_handle); + +/** + * Validate if the user memory comes from BO + * + * \param dev - [in] Device handle.
See #amdgpu_device_initialize() + * \param cpu - [in] CPU address of user allocated memory which we + * want to map to GPU address space (make GPU accessible) + * (This address must be correctly aligned). + * \param size - [in] Size of allocation (must be correctly aligned) + * \param buf_handle - [out] Buffer handle for the userptr memory + * if the user memory is not from BO, the buf_handle will be NULL. + * \param offset_in_bo - [out] offset in this BO for this user memory + * + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_find_bo_by_cpu_mapping(amdgpu_device_handle dev, + void *cpu, + uint64_t size, + amdgpu_bo_handle *buf_handle, + uint64_t *offset_in_bo); + +/** + * Request GPU access to physical memory from 3rd party device. + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param phys_address - [in] Physical address from 3rd party device which + * we want to map to GPU address space (make GPU accessible) + * (This address must be correctly aligned). + * \param size - [in] Size of allocation (must be correctly aligned) + * \param buf_handle - [out] Buffer handle for the userptr memory + * resource on submission and be used in other operations. + * + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \note + * This call should guarantee that such memory will be persistently + * "locked" / make non-pageable. The purpose of this call is to provide + * opportunity for GPU get access to this resource during submission. + * + * + * Supported (theoretical) max. size of mapping is restricted only by + * capability.direct_gma_size. 
See #amdgpu_query_capability() + * + * It is responsibility of caller to correctly specify physical_address +*/ +int amdgpu_create_bo_from_phys_mem(amdgpu_device_handle dev, + uint64_t phys_address, uint64_t size, + amdgpu_bo_handle *buf_handle); + +/** + * Get physical address from BO + * + * \param buf_handle - [in] Buffer handle for the physical address. + * \param phys_address - [out] Physical address of this BO. + * + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_bo_get_phys_address(amdgpu_bo_handle buf_handle, + uint64_t *phys_address); + +/** + * Free previously allocated memory + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param buf_handle - \c [in] Buffer handle to free + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \note In the case of memory shared between different applications all + * resources will be “physically” freed only after all such applications + * have terminated + * \note It is UMD responsibility to ‘free’ buffer only when there is no + * more GPU access + * + * \sa amdgpu_bo_set_metadata(), amdgpu_bo_alloc() + * +*/ +int amdgpu_bo_free(amdgpu_bo_handle buf_handle); + +/** + * Increase the reference count of a buffer object + * + * \param bo - \c [in] Buffer object handle to increase the reference count + * + * \sa amdgpu_bo_alloc(), amdgpu_bo_free() + * +*/ +void amdgpu_bo_inc_ref(amdgpu_bo_handle bo); + +/** + * Request CPU access to GPU accessible memory + * + * \param buf_handle - \c [in] Buffer handle + * \param cpu - \c [out] CPU address to be used for access + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_cpu_unmap() + * +*/ +int amdgpu_bo_cpu_map(amdgpu_bo_handle buf_handle, void **cpu); + +/** + * Release CPU access to GPU memory + * + * \param buf_handle - \c [in] Buffer handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_cpu_map() + * +*/ +int
amdgpu_bo_cpu_unmap(amdgpu_bo_handle buf_handle); + +/** + * Wait until a buffer is not used by the device. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param buf_handle - \c [in] Buffer handle. + * \param timeout_ns - Timeout in nanoseconds. + * \param buffer_busy - 0 if buffer is idle, all GPU access was completed + * and no GPU access is scheduled. + * 1 GPU access is in fly or scheduled + * + * \return 0 - on success + * <0 - Negative POSIX Error code + */ +int amdgpu_bo_wait_for_idle(amdgpu_bo_handle buf_handle, + uint64_t timeout_ns, + bool *buffer_busy); + +/** + * Creates a BO list handle for command submission. + * + * \param dev - \c [in] Device handle. + * See #amdgpu_device_initialize() + * \param number_of_buffers - \c [in] Number of BOs in the list + * \param buffers - \c [in] List of BO handles + * \param result - \c [out] Created BO list handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_list_destroy_raw(), amdgpu_cs_submit_raw2() +*/ +int amdgpu_bo_list_create_raw(amdgpu_device_handle dev, + uint32_t number_of_buffers, + struct drm_amdgpu_bo_list_entry *buffers, + uint32_t *result); + +/** + * Destroys a BO list handle. + * + * \param bo_list - \c [in] BO list handle. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_list_create_raw(), amdgpu_cs_submit_raw2() +*/ +int amdgpu_bo_list_destroy_raw(amdgpu_device_handle dev, uint32_t bo_list); + +/** + * Creates a BO list handle for command submission. + * + * \param dev - \c [in] Device handle. 
+ * See #amdgpu_device_initialize() + * \param number_of_resources - \c [in] Number of BOs in the list + * \param resources - \c [in] List of BO handles + * \param resource_prios - \c [in] Optional priority for each handle + * \param result - \c [out] Created BO list handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_list_destroy() +*/ +int amdgpu_bo_list_create(amdgpu_device_handle dev, + uint32_t number_of_resources, + amdgpu_bo_handle *resources, + uint8_t *resource_prios, + amdgpu_bo_list_handle *result); + +/** + * Destroys a BO list handle. + * + * \param handle - \c [in] BO list handle. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_list_create() +*/ +int amdgpu_bo_list_destroy(amdgpu_bo_list_handle handle); + +/** + * Update resources for existing BO list + * + * \param handle - \c [in] BO list handle + * \param number_of_resources - \c [in] Number of BOs in the list + * \param resources - \c [in] List of BO handles + * \param resource_prios - \c [in] Optional priority for each handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_list_update() +*/ +int amdgpu_bo_list_update(amdgpu_bo_list_handle handle, + uint32_t number_of_resources, + amdgpu_bo_handle *resources, + uint8_t *resource_prios); + +/* + * GPU Execution context + * +*/ + +/** + * Create GPU execution Context + * + * For the purpose of GPU Scheduler and GPU Robustness extensions it is + * necessary to have information/identify rendering/compute contexts. + * It also may be needed to associate some specific requirements with such + * contexts. Kernel driver will guarantee that submission from the same + * context will always be executed in order (first come, first serve). + * + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param priority - \c [in] Context creation flags. 
See AMDGPU_CTX_PRIORITY_* + * \param context - \c [out] GPU Context handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_cs_ctx_free() + * +*/ +int amdgpu_cs_ctx_create2(amdgpu_device_handle dev, + uint32_t priority, + amdgpu_context_handle *context); +/** + * Create GPU execution Context + * + * Refer to amdgpu_cs_ctx_create2 for full documentation. This call + * is missing the priority parameter. + * + * \sa amdgpu_cs_ctx_create2() + * +*/ +int amdgpu_cs_ctx_create(amdgpu_device_handle dev, + amdgpu_context_handle *context); + +/** + * + * Destroy GPU execution context when not needed any more + * + * \param context - \c [in] GPU Context handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_cs_ctx_create() + * +*/ +int amdgpu_cs_ctx_free(amdgpu_context_handle context); + +/** + * Override the submission priority for the given context using a master fd. + * + * \param dev - \c [in] device handle + * \param context - \c [in] context handle for context id + * \param master_fd - \c [in] The master fd to authorize the override. + * \param priority - \c [in] The priority to assign to the context. + * + * \return 0 on success or a negative POSIX error code on failure. + */ +int amdgpu_cs_ctx_override_priority(amdgpu_device_handle dev, + amdgpu_context_handle context, + int master_fd, + unsigned priority); + +/** + * Set or query the stable power state for GPU profiling. + * + * \param dev - \c [in] device handle + * \param op - \c [in] AMDGPU_CTX_OP_{GET,SET}_STABLE_PSTATE + * \param flags - \c [in] AMDGPU_CTX_STABLE_PSTATE_* + * \param out_flags - \c [out] output current stable pstate + * + * \return 0 on success otherwise POSIX Error code.
+ */ +int amdgpu_cs_ctx_stable_pstate(amdgpu_context_handle context, + uint32_t op, + uint32_t flags, + uint32_t *out_flags); + +/** + * Query reset state for the specific GPU Context + * + * \param context - \c [in] GPU Context handle + * \param state - \c [out] One of AMDGPU_CTX_*_RESET + * \param hangs - \c [out] Number of hangs caused by the context. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_cs_ctx_create() + * +*/ +int amdgpu_cs_query_reset_state(amdgpu_context_handle context, + uint32_t *state, uint32_t *hangs); + +/** + * Query reset state for the specific GPU Context. + * + * \param context - \c [in] GPU Context handle + * \param flags - \c [out] A combination of AMDGPU_CTX_QUERY2_FLAGS_* + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_cs_ctx_create() + * +*/ +int amdgpu_cs_query_reset_state2(amdgpu_context_handle context, + uint64_t *flags); + +/* + * Command Buffers Management + * +*/ + +/** + * Send request to submit command buffers to hardware. + * + * Kernel driver could use GPU Scheduler to make decision when physically + * sent this request to the hardware. Accordingly this request could be put + * in queue and sent for execution later. The only guarantee is that request + * from the same GPU context to the same ip:ip_instance:ring will be executed in + * order. + * + * The caller can specify the user fence buffer/location with the fence_info in the + * cs_request.The sequence number is returned via the 'seq_no' parameter + * in ibs_request structure. + * + * + * \param dev - \c [in] Device handle. + * See #amdgpu_device_initialize() + * \param context - \c [in] GPU Context + * \param flags - \c [in] Global submission flags + * \param ibs_request - \c [in/out] Pointer to submission requests. 
+ * We could submit to the several + * engines/rings simultaneously as + * 'atomic' operation + * \param number_of_requests - \c [in] Number of submission requests + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \note It is required to pass correct resource list with buffer handles + * which will be accessible by command buffers from submission + * This will allow kernel driver to correctly implement "paging". + * Failure to do so will have unpredictable results. + * + * \sa amdgpu_command_buffer_alloc(), amdgpu_command_buffer_free(), + * amdgpu_cs_query_fence_status() + * +*/ +int amdgpu_cs_submit(amdgpu_context_handle context, + uint64_t flags, + struct amdgpu_cs_request *ibs_request, + uint32_t number_of_requests); + +/** + * Query status of Command Buffer Submission + * + * \param fence - \c [in] Structure describing fence to query + * \param timeout_ns - \c [in] Timeout value to wait + * \param flags - \c [in] Flags for the query + * \param expired - \c [out] If fence expired or not.\n + * 0 – if fence is not expired\n + * !0 - otherwise + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \note If UMD wants only to check operation status and returned immediately + * then timeout value as 0 must be passed. In this case success will be + * returned in the case if submission was completed or timeout error + * code.
+ * + * \sa amdgpu_cs_submit() +*/ +int amdgpu_cs_query_fence_status(struct amdgpu_cs_fence *fence, + uint64_t timeout_ns, + uint64_t flags, + uint32_t *expired); + +/** + * Wait for multiple fences + * + * \param fences - \c [in] The fence array to wait + * \param fence_count - \c [in] The fence count + * \param wait_all - \c [in] If true, wait all fences to be signaled, + * otherwise, wait at least one fence + * \param timeout_ns - \c [in] The timeout to wait, in nanoseconds + * \param status - \c [out] '1' for signaled, '0' for timeout + * \param first - \c [out] the index of the first signaled fence from @fences + * + * \return 0 on success + * <0 - Negative POSIX Error code + * + * \note Currently it supports only one amdgpu_device. All fences come from + * the same amdgpu_device with the same fd. +*/ +int amdgpu_cs_wait_fences(struct amdgpu_cs_fence *fences, + uint32_t fence_count, + bool wait_all, + uint64_t timeout_ns, + uint32_t *status, uint32_t *first); + +/* + * Query / Info API + * +*/ + +/** + * Query allocation size alignments + * + * UMD should query information about GPU VM MC size alignments requirements + * to be able correctly choose required allocation size and implement + * internal optimization if needed. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param info - \c [out] Pointer to structure to get size alignment + * requirements + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_buffer_size_alignment(amdgpu_device_handle dev, + struct amdgpu_buffer_size_alignments + *info); + +/** + * Query firmware versions + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param fw_type - \c [in] AMDGPU_INFO_FW_* + * \param ip_instance - \c [in] Index of the IP block of the same type. + * \param index - \c [in] Index of the engine. 
(for SDMA and MEC) + * \param version - \c [out] Pointer to the "version" return value + * \param feature - \c [out] Pointer to the "feature" return value + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_firmware_version(amdgpu_device_handle dev, unsigned fw_type, + unsigned ip_instance, unsigned index, + uint32_t *version, uint32_t *feature); + +/** + * Query the number of HW IP instances of a certain type. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param type - \c [in] Hardware IP block type = AMDGPU_HW_IP_* + * \param count - \c [out] Pointer to structure to get information + * + * \return 0 on success\n + * <0 - Negative POSIX Error code +*/ +int amdgpu_query_hw_ip_count(amdgpu_device_handle dev, unsigned type, + uint32_t *count); + +/** + * Query engine information + * + * This query allows UMD to query information about different engines and their + * capabilities. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param type - \c [in] Hardware IP block type = AMDGPU_HW_IP_* + * \param ip_instance - \c [in] Index of the IP block of the same type. + * \param info - \c [out] Pointer to structure to get information + * + * \return 0 on success\n + * <0 - Negative POSIX Error code +*/ +int amdgpu_query_hw_ip_info(amdgpu_device_handle dev, unsigned type, + unsigned ip_instance, + struct drm_amdgpu_info_hw_ip *info); + +/** + * Query heap information + * + * This query allows UMD to query potentially available memory resources and + * adjust their logic if necessary. + * + * \param dev - \c [in] Device handle.
See #amdgpu_device_initialize() + * \param heap - \c [in] Heap type + * \param info - \c [in] Pointer to structure to get needed information + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_heap_info(amdgpu_device_handle dev, uint32_t heap, + uint32_t flags, struct amdgpu_heap_info *info); + +/** + * Get the CRTC ID from the mode object ID + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param id - \c [in] Mode object ID + * \param result - \c [in] Pointer to the CRTC ID + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_crtc_from_id(amdgpu_device_handle dev, unsigned id, + int32_t *result); + +/** + * Query GPU H/w Info + * + * Query hardware specific information + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param heap - \c [in] Heap type + * \param info - \c [in] Pointer to structure to get needed information + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_gpu_info(amdgpu_device_handle dev, + struct amdgpu_gpu_info *info); + +/** + * Query hardware or driver information. + * + * The return size is query-specific and depends on the "info_id" parameter. + * No more than "size" bytes is returned. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param info_id - \c [in] AMDGPU_INFO_* + * \param size - \c [in] Size of the returned value. + * \param value - \c [out] Pointer to the return value. + * + * \return 0 on success\n + * <0 - Negative POSIX error code + * +*/ +int amdgpu_query_info(amdgpu_device_handle dev, unsigned info_id, + unsigned size, void *value); + +/** + * Query hardware or driver capabilities. + * + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param value - \c [out] Pointer to the return value. 
+ * + * \return 0 on success\n + * <0 - Negative POSIX error code + * +*/ +int amdgpu_query_capability(amdgpu_device_handle dev, + struct drm_amdgpu_capability *cap); + +/** + * Query hardware or driver information. + * + * The return size is query-specific and depends on the "info_id" parameter. + * No more than "size" bytes is returned. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param info - \c [in] amdgpu_sw_info_* + * \param value - \c [out] Pointer to the return value. + * + * \return 0 on success\n + * <0 - Negative POSIX error code + * +*/ +int amdgpu_query_sw_info(amdgpu_device_handle dev, enum amdgpu_sw_info info, + void *value); + +/** + * Query information about GDS + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param gds_info - \c [out] Pointer to structure to get GDS information + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_gds_info(amdgpu_device_handle dev, + struct amdgpu_gds_resource_info *gds_info); + +/** + * Query information about sensor. + * + * The return size is query-specific and depends on the "sensor_type" + * parameter. No more than "size" bytes is returned. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param sensor_type - \c [in] AMDGPU_INFO_SENSOR_* + * \param size - \c [in] Size of the returned value. + * \param value - \c [out] Pointer to the return value. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_sensor_info(amdgpu_device_handle dev, unsigned sensor_type, + unsigned size, void *value); + +/** + * Query information about video capabilities + * + * The return sizeof(struct drm_amdgpu_info_video_caps) + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param caps_type - \c [in] AMDGPU_INFO_VIDEO_CAPS_DECODE(ENCODE) + * \param size - \c [in] Size of the returned value. 
+ * \param value - \c [out] Pointer to the return value. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_video_caps_info(amdgpu_device_handle dev, unsigned cap_type, + unsigned size, void *value); + +/** + * Query private aperture range + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param start - \c [out] Start of private aperture + * \param end - \c [out] End of private aperture + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_private_aperture(amdgpu_device_handle dev, + uint64_t *start, + uint64_t *end); + +/** + * Query shared aperture range + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param start - \c [out] Start of shared aperture + * \param end - \c [out] End of shared aperture + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_shared_aperture(amdgpu_device_handle dev, + uint64_t *start, + uint64_t *end); +/** + * Query information about VM faults + * + * The return sizeof(struct drm_amdgpu_info_gpuvm_fault) + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param size - \c [in] Size of the returned value. + * \param value - \c [out] Pointer to the return value. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_query_gpuvm_fault_info(amdgpu_device_handle dev, unsigned size, + void *value); + +/** + * Read a set of consecutive memory-mapped registers. + * Not all registers are allowed to be read by userspace. + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * \param dword_offset - \c [in] Register offset in dwords + * \param count - \c [in] The number of registers to read starting + * from the offset + * \param instance - \c [in] GRBM_GFX_INDEX selector. It may have other + * uses. Set it to 0xffffffff if unsure. + * \param flags - \c [in] Flags with additional information.
+ * \param values - \c [out] The pointer to return values. + * + * \return 0 on success\n + * <0 - Negative POSIX error code + * +*/ +int amdgpu_read_mm_registers(amdgpu_device_handle dev, unsigned dword_offset, + unsigned count, uint32_t instance, uint32_t flags, + uint32_t *values); + +/** + * Flag to request VA address range in the 32bit address space +*/ +#define AMDGPU_VA_RANGE_32_BIT 0x1 +#define AMDGPU_VA_RANGE_HIGH 0x2 +#define AMDGPU_VA_RANGE_REPLAYABLE 0x4 + +/** + * Allocate virtual address range + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param va_range_type - \c [in] Type of MC va range from which to allocate + * \param size - \c [in] Size of range. Size must be correctly aligned. + * It is client responsibility to correctly align size based on the future + * usage of allocated range. + * \param va_base_alignment - \c [in] Overwrite base address alignment + * requirement for GPU VM MC virtual + * address assignment. Must be multiple of size alignments received as + * 'amdgpu_buffer_size_alignments'. + * If 0 use the default one. + * \param va_base_required - \c [in] Specified required va base address. + * If 0 then library choose available one. + * If !0 value will be passed and that value is already "in use" then + * corresponding error status will be returned. + * \param va_base_allocated - \c [out] On return: Allocated VA base to be used + * by client. + * \param va_range_handle - \c [out] On return: Handle assigned to allocation + * \param flags - \c [in] flags for special VA range + * + * \return 0 on success\n + * >0 - AMD specific error code\n + * <0 - Negative POSIX Error code + * + * \notes \n + * It is client responsibility to correctly handle VA assignments and usage. + * Neither kernel driver nor libdrm_amdgpu are able to prevent and + * detect wrong va assignment.
+ * + * It is client responsibility to correctly handle multi-GPU cases and to pass + * the corresponding arrays of all devices handles where corresponding VA will + * be used. + * +*/ +int amdgpu_va_range_alloc(amdgpu_device_handle dev, + enum amdgpu_gpu_va_range va_range_type, + uint64_t size, + uint64_t va_base_alignment, + uint64_t va_base_required, + uint64_t *va_base_allocated, + amdgpu_va_handle *va_range_handle, + uint64_t flags); + +/** + * Free previously allocated virtual address range + * + * + * \param va_range_handle - \c [in] Handle assigned to VA allocation + * + * \return 0 on success\n + * >0 - AMD specific error code\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_va_range_free(amdgpu_va_handle va_range_handle); + +/** + * Return the starting address of the allocated virtual address range. + */ +uint64_t amdgpu_va_get_start_addr(amdgpu_va_handle va_handle); + +/** +* Query virtual address range +* +* UMD can query GPU VM range supported by each device +* to initialize its own VAM accordingly. +* +* \param dev - [in] Device handle. See #amdgpu_device_initialize() +* \param type - \c [in] Type of virtual address range +* \param offset - \c [out] Start offset of virtual address range +* \param size - \c [out] Size of virtual address range +* +* \return 0 on success\n +* <0 - Negative POSIX Error code +* +*/ + +int amdgpu_va_range_query(amdgpu_device_handle dev, + enum amdgpu_gpu_va_range type, + uint64_t *start, + uint64_t *end); + +/** + * Allocate an amdgpu_va_manager object. + * The returned object has to be initialized with the amdgpu_va_manager_init + * before use. + * On release, amdgpu_va_manager_deinit needs to be called, then the memory + * can be released using free().
+ */ +amdgpu_va_manager_handle amdgpu_va_manager_alloc(void); + +void amdgpu_va_manager_init(amdgpu_va_manager_handle va_mgr, + uint64_t low_va_offset, uint64_t low_va_max, + uint64_t high_va_offset, uint64_t high_va_max, + uint32_t virtual_address_alignment); + +void amdgpu_va_manager_deinit(amdgpu_va_manager_handle va_mgr); + +/** + * Similar to #amdgpu_va_range_alloc() but allocates VA + * directly from an amdgpu_va_manager_handle instead of using + * the manager from an amdgpu_device. + */ + +int amdgpu_va_range_alloc2(amdgpu_va_manager_handle va_mgr, + enum amdgpu_gpu_va_range va_range_type, + uint64_t size, + uint64_t va_base_alignment, + uint64_t va_base_required, + uint64_t *va_base_allocated, + amdgpu_va_handle *va_range_handle, + uint64_t flags); + +/** + * VA mapping/unmapping for the buffer object + * + * \param bo - \c [in] BO handle + * \param offset - \c [in] Start offset to map + * \param size - \c [in] Size to map + * \param addr - \c [in] Start virtual address. + * \param flags - \c [in] Supported flags for mapping/unmapping + * \param ops - \c [in] AMDGPU_VA_OP_MAP or AMDGPU_VA_OP_UNMAP + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ + +int amdgpu_bo_va_op(amdgpu_bo_handle bo, + uint64_t offset, + uint64_t size, + uint64_t addr, + uint64_t flags, + uint32_t ops); + +/** + * VA mapping/unmapping for a buffer object or PRT region. + * + * This is not a simple drop-in extension for amdgpu_bo_va_op; instead, all + * parameters are treated "raw", i.e. size is not automatically aligned, and + * all flags must be specified explicitly. + * + * \param dev - \c [in] device handle + * \param bo - \c [in] BO handle (may be NULL) + * \param offset - \c [in] Start offset to map + * \param size - \c [in] Size to map + * \param addr - \c [in] Start virtual address. 
+ * \param flags - \c [in] Supported flags for mapping/unmapping + * \param ops - \c [in] AMDGPU_VA_OP_MAP or AMDGPU_VA_OP_UNMAP + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ + +int amdgpu_bo_va_op_raw(amdgpu_device_handle dev, + amdgpu_bo_handle bo, + uint64_t offset, + uint64_t size, + uint64_t addr, + uint64_t flags, + uint32_t ops); + +/** + * create semaphore + * + * \param sem - \c [out] semaphore handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_create_semaphore(amdgpu_semaphore_handle *sem); + +/** + * signal semaphore + * + * \param context - \c [in] GPU Context + * \param ip_type - \c [in] Hardware IP block type = AMDGPU_HW_IP_* + * \param ip_instance - \c [in] Index of the IP block of the same type + * \param ring - \c [in] Specify ring index of the IP + * \param sem - \c [in] semaphore handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_signal_semaphore(amdgpu_context_handle ctx, + uint32_t ip_type, + uint32_t ip_instance, + uint32_t ring, + amdgpu_semaphore_handle sem); + +/** + * wait semaphore + * + * \param context - \c [in] GPU Context + * \param ip_type - \c [in] Hardware IP block type = AMDGPU_HW_IP_* + * \param ip_instance - \c [in] Index of the IP block of the same type + * \param ring - \c [in] Specify ring index of the IP + * \param sem - \c [in] semaphore handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_wait_semaphore(amdgpu_context_handle ctx, + uint32_t ip_type, + uint32_t ip_instance, + uint32_t ring, + amdgpu_semaphore_handle sem); + +/** + * destroy semaphore + * + * \param sem - \c [in] semaphore handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_destroy_semaphore(amdgpu_semaphore_handle sem); + +/** + * create sem + * + * \param dev - [in] Device handle. 
See #amdgpu_device_initialize() + * \param sem - \c [out] sem handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_create_sem(amdgpu_device_handle dev, + amdgpu_sem_handle *sem); + +/** + * signal sem + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param context - \c [in] GPU Context + * \param ip_type - \c [in] Hardware IP block type = AMDGPU_HW_IP_* + * \param ip_instance - \c [in] Index of the IP block of the same type + * \param ring - \c [in] Specify ring index of the IP + * \param sem - \c [out] sem handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_signal_sem(amdgpu_device_handle dev, + amdgpu_context_handle ctx, + uint32_t ip_type, + uint32_t ip_instance, + uint32_t ring, + amdgpu_sem_handle sem); + +/** + * wait sem + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + * \param context - \c [in] GPU Context + * \param ip_type - \c [in] Hardware IP block type = AMDGPU_HW_IP_* + * \param ip_instance - \c [in] Index of the IP block of the same type + * \param ring - \c [in] Specify ring index of the IP + * \param sem - \c [out] sem handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_wait_sem(amdgpu_device_handle dev, + amdgpu_context_handle ctx, + uint32_t ip_type, + uint32_t ip_instance, + uint32_t ring, + amdgpu_sem_handle sem); + +int amdgpu_cs_export_sem(amdgpu_device_handle dev, + amdgpu_sem_handle sem, + int *shared_handle); + +int amdgpu_cs_import_sem(amdgpu_device_handle dev, + int shared_handle, + amdgpu_sem_handle *sem); + +/** + * destroy sem + * + * \param dev - [in] Device handle. 
See #amdgpu_device_initialize() + * \param sem - \c [out] sem handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_destroy_sem(amdgpu_device_handle dev, + amdgpu_sem_handle sem); + +/** + * reserve vmid for this process + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + */ +int amdgpu_cs_reserved_vmid(amdgpu_device_handle dev); + +/** + * unreserve vmid for this process + * + * \param dev - [in] Device handle. See #amdgpu_device_initialize() + */ +int amdgpu_cs_unreserved_vmid(amdgpu_device_handle dev); + +/** + * Get the ASIC marketing name + * + * \param dev - \c [in] Device handle. See #amdgpu_device_initialize() + * + * \return the constant string of the marketing name + * "NULL" means the ASIC is not found +*/ +const char *amdgpu_get_marketing_name(amdgpu_device_handle dev); + +/** + * Create kernel sync object + * + * \param dev - \c [in] device handle + * \param flags - \c [in] flags that affect creation + * \param syncobj - \c [out] sync object handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_create_syncobj2(amdgpu_device_handle dev, + uint32_t flags, + uint32_t *syncobj); + +/** + * Create kernel sync object + * + * \param dev - \c [in] device handle + * \param syncobj - \c [out] sync object handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_create_syncobj(amdgpu_device_handle dev, + uint32_t *syncobj); +/** + * Destroy kernel sync object + * + * \param dev - \c [in] device handle + * \param syncobj - \c [in] sync object handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_destroy_syncobj(amdgpu_device_handle dev, + uint32_t syncobj); + +/** + * Reset kernel sync objects to unsignalled state. 
+ * + * \param dev - \c [in] device handle + * \param syncobjs - \c [in] array of sync object handles + * \param syncobj_count - \c [in] number of handles in syncobjs + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_syncobj_reset(amdgpu_device_handle dev, + const uint32_t *syncobjs, uint32_t syncobj_count); + +/** + * Signal kernel sync objects. + * + * \param dev - \c [in] device handle + * \param syncobjs - \c [in] array of sync object handles + * \param syncobj_count - \c [in] number of handles in syncobjs + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_syncobj_signal(amdgpu_device_handle dev, + const uint32_t *syncobjs, uint32_t syncobj_count); + +/** + * Signal kernel timeline sync objects. + * + * \param dev - \c [in] device handle + * \param syncobjs - \c [in] array of sync object handles + * \param points - \c [in] array of timeline points + * \param syncobj_count - \c [in] number of handles in syncobjs + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_syncobj_timeline_signal(amdgpu_device_handle dev, + const uint32_t *syncobjs, + uint64_t *points, + uint32_t syncobj_count); + +/** + * Wait for one or all sync objects to signal. + * + * \param dev - \c [in] self-explanatory + * \param handles - \c [in] array of sync object handles + * \param num_handles - \c [in] self-explanatory + * \param timeout_nsec - \c [in] self-explanatory + * \param flags - \c [in] a bitmask of DRM_SYNCOBJ_WAIT_FLAGS_* + * \param first_signaled - \c [in] self-explanatory + * + * \return 0 on success\n + * -ETIME - Timeout + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_wait(amdgpu_device_handle dev, + uint32_t *handles, unsigned num_handles, + int64_t timeout_nsec, unsigned flags, + uint32_t *first_signaled); + +/** + * Wait for one or all sync objects on their points to signal. 
+ * + * \param dev - \c [in] self-explanatory + * \param handles - \c [in] array of sync object handles + * \param points - \c [in] array of sync points to wait + * \param num_handles - \c [in] self-explanatory + * \param timeout_nsec - \c [in] self-explanatory + * \param flags - \c [in] a bitmask of DRM_SYNCOBJ_WAIT_FLAGS_* + * \param first_signaled - \c [in] self-explanatory + * + * \return 0 on success\n + * -ETIME - Timeout + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_timeline_wait(amdgpu_device_handle dev, + uint32_t *handles, uint64_t *points, + unsigned num_handles, + int64_t timeout_nsec, unsigned flags, + uint32_t *first_signaled); +/** + * Query sync objects payloads. + * + * \param dev - \c [in] self-explanatory + * \param handles - \c [in] array of sync object handles + * \param points - \c [out] array of sync points returned, which presents + * syncobj payload. + * \param num_handles - \c [in] self-explanatory + * + * \return 0 on success\n + * -ETIME - Timeout + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_query(amdgpu_device_handle dev, + uint32_t *handles, uint64_t *points, + unsigned num_handles); +/** + * Query sync objects last signaled or submitted point. + * + * \param dev - \c [in] self-explanatory + * \param handles - \c [in] array of sync object handles + * \param points - \c [out] array of sync points returned, which presents + * syncobj payload. + * \param num_handles - \c [in] self-explanatory + * \param flags - \c [in] a bitmask of DRM_SYNCOBJ_QUERY_FLAGS_* + * + * \return 0 on success\n + * -ETIME - Timeout + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_query2(amdgpu_device_handle dev, + uint32_t *handles, uint64_t *points, + unsigned num_handles, uint32_t flags); + +/** + * Export kernel sync object to shareable fd. + * + * \param dev - \c [in] device handle + * \param syncobj - \c [in] sync object handle + * \param shared_fd - \c [out] shared file descriptor. 
+ * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_export_syncobj(amdgpu_device_handle dev, + uint32_t syncobj, + int *shared_fd); +/** + * Import kernel sync object from shareable fd. + * + * \param dev - \c [in] device handle + * \param shared_fd - \c [in] shared file descriptor. + * \param syncobj - \c [out] sync object handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * +*/ +int amdgpu_cs_import_syncobj(amdgpu_device_handle dev, + int shared_fd, + uint32_t *syncobj); + +/** + * Export kernel sync object to a sync_file. + * + * \param dev - \c [in] device handle + * \param syncobj - \c [in] sync object handle + * \param sync_file_fd - \c [out] sync_file file descriptor. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_export_sync_file(amdgpu_device_handle dev, + uint32_t syncobj, + int *sync_file_fd); + +/** + * Import kernel sync object from a sync_file. + * + * \param dev - \c [in] device handle + * \param syncobj - \c [in] sync object handle + * \param sync_file_fd - \c [in] sync_file file descriptor. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_import_sync_file(amdgpu_device_handle dev, + uint32_t syncobj, + int sync_file_fd); +/** + * Export kernel timeline sync object to a sync_file. + * + * \param dev - \c [in] device handle + * \param syncobj - \c [in] sync object handle + * \param point - \c [in] timeline point + * \param flags - \c [in] flags + * \param sync_file_fd - \c [out] sync_file file descriptor. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_export_sync_file2(amdgpu_device_handle dev, + uint32_t syncobj, + uint64_t point, + uint32_t flags, + int *sync_file_fd); + +/** + * Import kernel timeline sync object from a sync_file. 
+ * + * \param dev - \c [in] device handle + * \param syncobj - \c [in] sync object handle + * \param point - \c [in] timeline point + * \param sync_file_fd - \c [in] sync_file file descriptor. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_import_sync_file2(amdgpu_device_handle dev, + uint32_t syncobj, + uint64_t point, + int sync_file_fd); + +/** + * Transfer between syncobjs. + * + * \param dev - \c [in] device handle + * \param dst_handle - \c [in] sync object handle + * \param dst_point - \c [in] timeline point, 0 means dst is binary + * \param src_handle - \c [in] sync object handle + * \param src_point - \c [in] timeline point, 0 means src is binary + * \param flags - \c [in] flags + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +int amdgpu_cs_syncobj_transfer(amdgpu_device_handle dev, + uint32_t dst_handle, + uint64_t dst_point, + uint32_t src_handle, + uint64_t src_point, + uint32_t flags); + +/** + * Export an amdgpu fence as a handle (syncobj or fd). + * + * \param what AMDGPU_FENCE_TO_HANDLE_GET_{SYNCOBJ, FD} + * \param out_handle returned handle + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + */ +int amdgpu_cs_fence_to_handle(amdgpu_device_handle dev, + struct amdgpu_cs_fence *fence, + uint32_t what, + uint32_t *out_handle); + +/** + * Submit raw command submission to kernel + * + * \param dev - \c [in] device handle + * \param context - \c [in] context handle for context id + * \param bo_list_handle - \c [in] request bo list handle (0 for none) + * \param num_chunks - \c [in] number of CS chunks to submit + * \param chunks - \c [in] array of CS chunks + * \param seq_no - \c [out] output sequence number for submission. 
+ * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + */ +struct drm_amdgpu_cs_chunk; +struct drm_amdgpu_cs_chunk_dep; +struct drm_amdgpu_cs_chunk_data; + +int amdgpu_cs_submit_raw(amdgpu_device_handle dev, + amdgpu_context_handle context, + amdgpu_bo_list_handle bo_list_handle, + int num_chunks, + struct drm_amdgpu_cs_chunk *chunks, + uint64_t *seq_no); + +/** + * Submit raw command submission to the kernel with a raw BO list handle. + * + * \param dev - \c [in] device handle + * \param context - \c [in] context handle for context id + * \param bo_list_handle - \c [in] raw bo list handle (0 for none) + * \param num_chunks - \c [in] number of CS chunks to submit + * \param chunks - \c [in] array of CS chunks + * \param seq_no - \c [out] output sequence number for submission. + * + * \return 0 on success\n + * <0 - Negative POSIX Error code + * + * \sa amdgpu_bo_list_create_raw(), amdgpu_bo_list_destroy_raw() + */ +int amdgpu_cs_submit_raw2(amdgpu_device_handle dev, + amdgpu_context_handle context, + uint32_t bo_list_handle, + int num_chunks, + struct drm_amdgpu_cs_chunk *chunks, + uint64_t *seq_no); + +void amdgpu_cs_chunk_fence_to_dep(struct amdgpu_cs_fence *fence, + struct drm_amdgpu_cs_chunk_dep *dep); +void amdgpu_cs_chunk_fence_info_to_data(struct amdgpu_cs_fence_info *fence_info, + struct drm_amdgpu_cs_chunk_data *data); + +/** + * Reserve VMID + * \param dev - \c [in] device handle + * \param flags - \c [in] TBD + * + * \return 0 on success otherwise POSIX Error code +*/ +int amdgpu_vm_reserve_vmid(amdgpu_device_handle dev, uint32_t flags); + +/** + * Free reserved VMID + * \param dev - \c [in] device handle + * \param flags - \c [in] TBD + * + * \return 0 on success otherwise POSIX Error code +*/ +int amdgpu_vm_unreserve_vmid(amdgpu_device_handle dev, uint32_t flags); + +#ifdef __cplusplus +} +#endif +#endif /* #ifdef _AMDGPU_H_ */ diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu_drm.h 
b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu_drm.h new file mode 100644 index 0000000000..9c595f392a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/amdgpu_drm.h @@ -0,0 +1,1382 @@ +/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*- + * + * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Fremont, California. + * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kevin E. 
Martin + * Gareth Hughes + * Keith Whitwell + */ + +#ifndef __AMDGPU_DRM_H__ +#define __AMDGPU_DRM_H__ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_AMDGPU_GEM_CREATE 0x00 +#define DRM_AMDGPU_GEM_MMAP 0x01 +#define DRM_AMDGPU_CTX 0x02 +#define DRM_AMDGPU_BO_LIST 0x03 +#define DRM_AMDGPU_CS 0x04 +#define DRM_AMDGPU_INFO 0x05 +#define DRM_AMDGPU_GEM_METADATA 0x06 +#define DRM_AMDGPU_GEM_WAIT_IDLE 0x07 +#define DRM_AMDGPU_GEM_VA 0x08 +#define DRM_AMDGPU_WAIT_CS 0x09 +#define DRM_AMDGPU_GEM_OP 0x10 +#define DRM_AMDGPU_GEM_USERPTR 0x11 +#define DRM_AMDGPU_WAIT_FENCES 0x12 +#define DRM_AMDGPU_VM 0x13 +#define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 +#define DRM_AMDGPU_SCHED 0x15 + +/* hybrid specific ioctls */ +#define DRM_AMDGPU_SEM 0x5b +#define DRM_AMDGPU_GEM_DGMA 0x5c + +#define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) +#define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) +#define DRM_IOCTL_AMDGPU_CTX DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx) +#define DRM_IOCTL_AMDGPU_BO_LIST DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list) +#define DRM_IOCTL_AMDGPU_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs) +#define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) +#define DRM_IOCTL_AMDGPU_GEM_METADATA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata) +#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle) +#define DRM_IOCTL_AMDGPU_GEM_VA DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va) +#define DRM_IOCTL_AMDGPU_WAIT_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs) +#define DRM_IOCTL_AMDGPU_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct 
drm_amdgpu_gem_op) +#define DRM_IOCTL_AMDGPU_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr) +#define DRM_IOCTL_AMDGPU_WAIT_FENCES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences) +#define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm) +#define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) +#define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) +/* hybrid specific ioctls */ +#define DRM_IOCTL_AMDGPU_SEM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem) +#define DRM_IOCTL_AMDGPU_GEM_DGMA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma) + +/** + * DOC: memory domains + * + * %AMDGPU_GEM_DOMAIN_CPU System memory that is not GPU accessible. + * Memory in this pool could be swapped out to disk if there is pressure. + * + * %AMDGPU_GEM_DOMAIN_GTT GPU accessible system memory, mapped into the + * GPU's virtual address space via gart. Gart memory linearizes non-contiguous + * pages of system memory, allows GPU access system memory in a linearized + * fashion. + * + * %AMDGPU_GEM_DOMAIN_VRAM Local video memory. For APUs, it is memory + * carved out by the BIOS. + * + * %AMDGPU_GEM_DOMAIN_GDS Global on-chip data storage used to share data + * across shader threads. + * + * %AMDGPU_GEM_DOMAIN_GWS Global wave sync, used to synchronize the + * execution of all the waves on a device. + * + * %AMDGPU_GEM_DOMAIN_OA Ordered append, used by 3D or Compute engines + * for appending data. + * + * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for + * signalling user mode queues. 
+ */ +#define AMDGPU_GEM_DOMAIN_CPU 0x1 +#define AMDGPU_GEM_DOMAIN_GTT 0x2 +#define AMDGPU_GEM_DOMAIN_VRAM 0x4 +#define AMDGPU_GEM_DOMAIN_GDS 0x8 +#define AMDGPU_GEM_DOMAIN_GWS 0x10 +#define AMDGPU_GEM_DOMAIN_OA 0x20 +#define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_DGMA 0x400 +#define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ + AMDGPU_GEM_DOMAIN_GTT | \ + AMDGPU_GEM_DOMAIN_VRAM | \ + AMDGPU_GEM_DOMAIN_GDS | \ + AMDGPU_GEM_DOMAIN_GWS | \ + AMDGPU_GEM_DOMAIN_OA | \ + AMDGPU_GEM_DOMAIN_DOORBELL | \ + AMDGPU_GEM_DOMAIN_DGMA) + +/* Flag that CPU access will be required for the case of VRAM domain */ +#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) +/* Flag that CPU access will not work, this VRAM domain is invisible */ +#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS (1 << 1) +/* Flag that USWC attributes should be used for GTT */ +#define AMDGPU_GEM_CREATE_CPU_GTT_USWC (1 << 2) +/* Flag that the memory should be in VRAM and cleared */ +#define AMDGPU_GEM_CREATE_VRAM_CLEARED (1 << 3) +/* Flag that allocating the BO should use linear VRAM */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) +/* Flag that BO is always valid in this VM */ +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) +/* Flag that BO sharing will be explicitly synchronized */ +#define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that indicates allocating MQD gart on GFX9, where the mtype + * for the second page onward should be set to NC. It should never + * be used by user space applications. 
+ */ +#define AMDGPU_GEM_CREATE_CP_MQD_GFX9 (1 << 8) +/* Flag that BO may contain sensitive data that must be wiped before + * releasing the memory + */ +#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) +/* Flag that BO will be used only in preemptible context, which does + * not require GTT memory accounting + */ +#define AMDGPU_GEM_CREATE_PREEMPTIBLE (1 << 11) +/* Flag that BO can be discarded under memory pressure without keeping the + * content. + */ +#define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) +/* Flag that BO is shared coherently between multiple devices or CPU threads. + * May depend on GPU instructions to flush caches to system scope explicitly. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_COHERENT (1 << 13) +/* Flag that BO should not be cached by GPU. Coherent without having to flush + * GPU caches explicitly + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) +/* Flag that BO should be coherent across devices when using device-level + * atomics. May depend on GPU instructions to flush caches to device scope + * explicitly, promoting them to system scope automatically. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. 
+ */ +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) + +/* Hybrid specific */ +/* Flag that the memory allocation should be from top of domain */ +#define AMDGPU_GEM_CREATE_TOP_DOWN (1ULL << 30) +/* Flag that the memory allocation should be pinned */ +#define AMDGPU_GEM_CREATE_NO_EVICT (1ULL << 31) + +struct drm_amdgpu_gem_create_in { + /** the requested memory size */ + __u64 bo_size; + /** physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; + /** the requested memory domains */ + __u64 domains; + /** allocation flags */ + __u64 domain_flags; +}; + +struct drm_amdgpu_gem_create_out { + /** returned GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +union drm_amdgpu_gem_create { + struct drm_amdgpu_gem_create_in in; + struct drm_amdgpu_gem_create_out out; +}; + +/** Opcode to create new residency list. */ +#define AMDGPU_BO_LIST_OP_CREATE 0 +/** Opcode to destroy previously created residency list */ +#define AMDGPU_BO_LIST_OP_DESTROY 1 +/** Opcode to update resource information in the list */ +#define AMDGPU_BO_LIST_OP_UPDATE 2 + +struct drm_amdgpu_bo_list_in { + /** Type of operation */ + __u32 operation; + /** Handle of list or 0 if we want to create one */ + __u32 list_handle; + /** Number of BOs in list */ + __u32 bo_number; + /** Size of each element describing BO */ + __u32 bo_info_size; + /** Pointer to array describing BOs */ + __u64 bo_info_ptr; +}; + +struct drm_amdgpu_bo_list_entry { + /** Handle of BO */ + __u32 bo_handle; + /** New (if specified) BO priority to be used during migration */ + __u32 bo_priority; +}; + +struct drm_amdgpu_bo_list_out { + /** Handle of resource list */ + __u32 list_handle; + __u32 _pad; +}; + +union drm_amdgpu_bo_list { + struct drm_amdgpu_bo_list_in in; + struct drm_amdgpu_bo_list_out out; +}; + +/* context related */ +#define AMDGPU_CTX_OP_ALLOC_CTX 1 +#define AMDGPU_CTX_OP_FREE_CTX 2 +#define AMDGPU_CTX_OP_QUERY_STATE 3 +#define AMDGPU_CTX_OP_QUERY_STATE2 4 +#define 
AMDGPU_CTX_OP_GET_STABLE_PSTATE 5 +#define AMDGPU_CTX_OP_SET_STABLE_PSTATE 6 + +/* GPU reset status */ +#define AMDGPU_CTX_NO_RESET 0 +/* this context caused it */ +#define AMDGPU_CTX_GUILTY_RESET 1 +/* some other context caused it */ +#define AMDGPU_CTX_INNOCENT_RESET 2 +/* unknown cause */ +#define AMDGPU_CTX_UNKNOWN_RESET 3 + +/* indicate gpu reset occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET (1<<0) +/* indicate vram lost occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1) +/* indicate some job from this context once caused a gpu hang */ +#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2) +/* indicate some errors are detected by RAS */ +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3) +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4) +/* indicate that the reset hasn't completed yet */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5) + +/* Context priority level */ +#define AMDGPU_CTX_PRIORITY_UNSET -2048 +#define AMDGPU_CTX_PRIORITY_VERY_LOW -1023 +#define AMDGPU_CTX_PRIORITY_LOW -512 +#define AMDGPU_CTX_PRIORITY_NORMAL 0 +/* + * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires + * CAP_SYS_NICE or DRM_MASTER +*/ +#define AMDGPU_CTX_PRIORITY_HIGH 512 +#define AMDGPU_CTX_PRIORITY_VERY_HIGH 1023 + +/* select a stable profiling pstate for perfmon tools */ +#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK 0xf +#define AMDGPU_CTX_STABLE_PSTATE_NONE 0 +#define AMDGPU_CTX_STABLE_PSTATE_STANDARD 1 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK 2 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK 3 +#define AMDGPU_CTX_STABLE_PSTATE_PEAK 4 + +struct drm_amdgpu_ctx_in { + /** AMDGPU_CTX_OP_* */ + __u32 op; + /** Flags */ + __u32 flags; + __u32 ctx_id; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; +}; + +union drm_amdgpu_ctx_out { + struct { + __u32 ctx_id; + __u32 _pad; + } alloc; + + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** Number of resets caused by this context so far. 
*/ + __u32 hangs; + /** Reset status since the last call of the ioctl. */ + __u32 reset_status; + } state; + + struct { + __u32 flags; + __u32 _pad; + } pstate; +}; + +union drm_amdgpu_ctx { + struct drm_amdgpu_ctx_in in; + union drm_amdgpu_ctx_out out; +}; + +/* sem related */ +#define AMDGPU_SEM_OP_CREATE_SEM 1 +#define AMDGPU_SEM_OP_WAIT_SEM 2 +#define AMDGPU_SEM_OP_SIGNAL_SEM 3 +#define AMDGPU_SEM_OP_DESTROY_SEM 4 +#define AMDGPU_SEM_OP_IMPORT_SEM 5 +#define AMDGPU_SEM_OP_EXPORT_SEM 6 + +struct drm_amdgpu_sem_in { + /** AMDGPU_SEM_OP_* */ + uint32_t op; + uint32_t handle; + uint32_t ctx_id; + uint32_t ip_type; + uint32_t ip_instance; + uint32_t ring; + uint64_t seq; +}; + +union drm_amdgpu_sem_out { + int fd; + uint32_t handle; +}; + +union drm_amdgpu_sem { + struct drm_amdgpu_sem_in in; + union drm_amdgpu_sem_out out; +}; + +/* vm ioctl */ +#define AMDGPU_VM_OP_RESERVE_VMID 1 +#define AMDGPU_VM_OP_UNRESERVE_VMID 2 + +struct drm_amdgpu_vm_in { + /** AMDGPU_VM_OP_* */ + __u32 op; + __u32 flags; +}; + +struct drm_amdgpu_vm_out { + /** For future use, no flags defined so far */ + __u64 flags; +}; + +union drm_amdgpu_vm { + struct drm_amdgpu_vm_in in; + struct drm_amdgpu_vm_out out; +}; + +/* sched ioctl */ +#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 +#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 + +struct drm_amdgpu_sched_in { + /* AMDGPU_SCHED_OP_* */ + __u32 op; + __u32 fd; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; + __u32 ctx_id; +}; + +union drm_amdgpu_sched { + struct drm_amdgpu_sched_in in; +}; + +/* + * This is not a reliable API and you should expect it to fail for any + * number of reasons and have fallback path that do not use userptr to + * perform any operation. 
+ */ +#define AMDGPU_GEM_USERPTR_READONLY (1 << 0) +#define AMDGPU_GEM_USERPTR_ANONONLY (1 << 1) +#define AMDGPU_GEM_USERPTR_VALIDATE (1 << 2) +#define AMDGPU_GEM_USERPTR_REGISTER (1 << 3) + +struct drm_amdgpu_gem_userptr { + __u64 addr; + __u64 size; + /* AMDGPU_GEM_USERPTR_* */ + __u32 flags; + /* Resulting GEM handle */ + __u32 handle; +}; + +#define AMDGPU_GEM_DGMA_IMPORT 0 +#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR 1 +struct drm_amdgpu_gem_dgma { + uint64_t addr; + uint64_t size; + uint32_t op; + uint32_t handle; +}; + +/* SI-CI-VI: */ +/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ +#define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 +#define AMDGPU_TILING_ARRAY_MODE_MASK 0xf +#define AMDGPU_TILING_PIPE_CONFIG_SHIFT 4 +#define AMDGPU_TILING_PIPE_CONFIG_MASK 0x1f +#define AMDGPU_TILING_TILE_SPLIT_SHIFT 9 +#define AMDGPU_TILING_TILE_SPLIT_MASK 0x7 +#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT 12 +#define AMDGPU_TILING_MICRO_TILE_MODE_MASK 0x7 +#define AMDGPU_TILING_BANK_WIDTH_SHIFT 15 +#define AMDGPU_TILING_BANK_WIDTH_MASK 0x3 +#define AMDGPU_TILING_BANK_HEIGHT_SHIFT 17 +#define AMDGPU_TILING_BANK_HEIGHT_MASK 0x3 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT 19 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK 0x3 +#define AMDGPU_TILING_NUM_BANKS_SHIFT 21 +#define AMDGPU_TILING_NUM_BANKS_MASK 0x3 + +/* GFX9 - GFX11: */ +#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_SWIZZLE_MODE_MASK 0x1f +#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT 5 +#define AMDGPU_TILING_DCC_OFFSET_256B_MASK 0xFFFFFF +#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT 29 +#define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 + +/* GFX12 and later: */ +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 
0 +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 +/* These are DCC recompression setting for memory management: */ +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ + +/* Set/Get helpers for tiling flags. */ +#define AMDGPU_TILING_SET(field, value) \ + (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT) +#define AMDGPU_TILING_GET(value, field) \ + (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK) + +#define AMDGPU_GEM_METADATA_OP_SET_METADATA 1 +#define AMDGPU_GEM_METADATA_OP_GET_METADATA 2 + +/** The same structure is shared for input/output */ +struct drm_amdgpu_gem_metadata { + /** GEM Object handle */ + __u32 handle; + /** Do we want get or set metadata */ + __u32 op; + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** family specific tiling info */ + __u64 tiling_info; + __u32 data_size_bytes; + __u32 data[64]; + } data; +}; + +struct drm_amdgpu_gem_mmap_in { + /** the GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +struct drm_amdgpu_gem_mmap_out { + /** mmap offset from the vma offset manager */ + __u64 addr_ptr; +}; + +union drm_amdgpu_gem_mmap { + struct drm_amdgpu_gem_mmap_in in; + struct drm_amdgpu_gem_mmap_out out; +}; + +struct drm_amdgpu_gem_wait_idle_in { + /** GEM object handle */ + __u32 handle; + /** For future use, no flags defined so far */ + __u32 flags; + /** Absolute timeout to wait */ + __u64 timeout; +}; + +struct drm_amdgpu_gem_wait_idle_out { + /** BO status: 0 - BO is idle, 1 - BO is busy */ + __u32 status; + /** Returned current memory domain */ + 
__u32 domain; +}; + +union drm_amdgpu_gem_wait_idle { + struct drm_amdgpu_gem_wait_idle_in in; + struct drm_amdgpu_gem_wait_idle_out out; +}; + +struct drm_amdgpu_wait_cs_in { + /* Command submission handle + * handle equals 0 means none to wait for + * handle equals ~0ull means wait for the latest sequence number + */ + __u64 handle; + /** Absolute timeout to wait */ + __u64 timeout; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; +}; + +struct drm_amdgpu_wait_cs_out { + /** CS status: 0 - CS completed, 1 - CS still busy */ + __u64 status; +}; + +union drm_amdgpu_wait_cs { + struct drm_amdgpu_wait_cs_in in; + struct drm_amdgpu_wait_cs_out out; +}; + +struct drm_amdgpu_fence { + __u32 ctx_id; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u64 seq_no; +}; + +struct drm_amdgpu_wait_fences_in { + /** This points to uint64_t * which points to fences */ + __u64 fences; + __u32 fence_count; + __u32 wait_all; + __u64 timeout_ns; +}; + +struct drm_amdgpu_wait_fences_out { + __u32 status; + __u32 first_signaled; +}; + +union drm_amdgpu_wait_fences { + struct drm_amdgpu_wait_fences_in in; + struct drm_amdgpu_wait_fences_out out; +}; + +#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO 0 +#define AMDGPU_GEM_OP_SET_PLACEMENT 1 + +/* Sets or returns a value associated with a buffer. 
*/ +struct drm_amdgpu_gem_op { + /** GEM object handle */ + __u32 handle; + /** AMDGPU_GEM_OP_* */ + __u32 op; + /** Input or return value */ + __u64 value; +}; + +#define AMDGPU_VA_OP_MAP 1 +#define AMDGPU_VA_OP_UNMAP 2 +#define AMDGPU_VA_OP_CLEAR 3 +#define AMDGPU_VA_OP_REPLACE 4 + +/* Delay the page table update till the next CS */ +#define AMDGPU_VM_DELAY_UPDATE (1 << 0) + +/* Mapping flags */ +/* readable mapping */ +#define AMDGPU_VM_PAGE_READABLE (1 << 1) +/* writable mapping */ +#define AMDGPU_VM_PAGE_WRITEABLE (1 << 2) +/* executable mapping, new for VI */ +#define AMDGPU_VM_PAGE_EXECUTABLE (1 << 3) +/* partially resident texture */ +#define AMDGPU_VM_PAGE_PRT (1 << 4) +/* MTYPE flags use bit 5 to 8 */ +#define AMDGPU_VM_MTYPE_MASK (0xf << 5) +/* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs. */ +#define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) +/* Use Non Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_NC (1 << 5) +/* Use Write Combine MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_WC (2 << 5) +/* Use Cache Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_CC (3 << 5) +/* Use UnCached MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_UC (4 << 5) +/* Use Read Write MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_RW (5 << 5) +/* don't allocate MALL */ +#define AMDGPU_VM_PAGE_NOALLOC (1 << 9) + +struct drm_amdgpu_gem_va { + /** GEM object handle */ + __u32 handle; + __u32 _pad; + /** AMDGPU_VA_OP_* */ + __u32 operation; + /** AMDGPU_VM_PAGE_* */ + __u32 flags; + /** va address to assign . Must be correctly aligned.*/ + __u64 va_address; + /** Specify offset inside of BO to assign. Must be correctly aligned.*/ + __u64 offset_in_bo; + /** Specify mapping size. Must be correctly aligned. 
*/ + __u64 map_size; +}; + +#define AMDGPU_HW_IP_GFX 0 +#define AMDGPU_HW_IP_COMPUTE 1 +#define AMDGPU_HW_IP_DMA 2 +#define AMDGPU_HW_IP_UVD 3 +#define AMDGPU_HW_IP_VCE 4 +#define AMDGPU_HW_IP_UVD_ENC 5 +#define AMDGPU_HW_IP_VCN_DEC 6 +/* + * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support + * both encoding and decoding jobs. + */ +#define AMDGPU_HW_IP_VCN_ENC 7 +#define AMDGPU_HW_IP_VCN_JPEG 8 +#define AMDGPU_HW_IP_VPE 9 +#define AMDGPU_HW_IP_NUM 10 + +#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1 + +#define AMDGPU_CHUNK_ID_IB 0x01 +#define AMDGPU_CHUNK_ID_FENCE 0x02 +#define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 +#define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 +#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 +#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT 0x08 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL 0x09 +#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW 0x0a + +struct drm_amdgpu_cs_chunk { + __u32 chunk_id; + __u32 length_dw; + __u64 chunk_data; +}; + +struct drm_amdgpu_cs_in { + /** Rendering context id */ + __u32 ctx_id; + /** Handle of resource list associated with CS */ + __u32 bo_list_handle; + __u32 num_chunks; + __u32 flags; + /** this points to __u64 * which point to cs chunks */ + __u64 chunks; +}; + +struct drm_amdgpu_cs_out { + __u64 handle; +}; + +union drm_amdgpu_cs { + struct drm_amdgpu_cs_in in; + struct drm_amdgpu_cs_out out; +}; + +/* Specify flags to be used for IB */ + +/* This IB should be submitted to CE */ +#define AMDGPU_IB_FLAG_CE (1<<0) + +/* Preamble flag, which means the IB could be dropped if no context switch */ +#define AMDGPU_IB_FLAG_PREAMBLE (1<<1) + +/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ +#define AMDGPU_IB_FLAG_PREEMPT (1<<2) + +/* The IB fence should do the L2 writeback but not invalidate any shader + * caches (L2/vL1/sL1/I$). 
*/ +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) + +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. + */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + +struct drm_amdgpu_cs_chunk_ib { + __u32 _pad; + /** AMDGPU_IB_FLAG_* */ + __u32 flags; + /** Virtual address to begin IB execution */ + __u64 va_start; + /** Size of submission */ + __u32 ib_bytes; + /** HW IP to submit to */ + __u32 ip_type; + /** HW IP index of the same type to submit to */ + __u32 ip_instance; + /** Ring index to submit to */ + __u32 ring; +}; + +struct drm_amdgpu_cs_chunk_dep { + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; + __u64 handle; +}; + +struct drm_amdgpu_cs_chunk_fence { + __u32 handle; + __u32 offset; +}; + +struct drm_amdgpu_cs_chunk_sem { + __u32 handle; +}; + +struct drm_amdgpu_cs_chunk_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ 0 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD 1 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD 2 + +union drm_amdgpu_fence_to_handle { + struct { + struct drm_amdgpu_fence fence; + __u32 what; + __u32 pad; + } in; + struct { + __u32 handle; + } out; +}; + +struct drm_amdgpu_cs_chunk_data { + union { + struct drm_amdgpu_cs_chunk_ib ib_data; + struct drm_amdgpu_cs_chunk_fence fence_data; + }; +}; + +#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW 0x1 + +struct drm_amdgpu_cs_chunk_cp_gfx_shadow { + __u64 shadow_va; + __u64 csa_va; + __u64 gds_va; + __u64 flags; +}; + +/* + * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU + * + */ +#define AMDGPU_IDS_FLAGS_FUSION 0x1 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 +#define AMDGPU_IDS_FLAGS_TMZ 0x4 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 + +/* + * Query h/w info: Flag identifying VF/PF/PT mode + * + */ +#define AMDGPU_IDS_FLAGS_MODE_MASK 0x300 +#define AMDGPU_IDS_FLAGS_MODE_SHIFT 0x8 +#define AMDGPU_IDS_FLAGS_MODE_PF 0x0 +#define AMDGPU_IDS_FLAGS_MODE_VF 0x1 +#define AMDGPU_IDS_FLAGS_MODE_PT 0x2 + +/* indicate if acceleration can be working */ +#define AMDGPU_INFO_ACCEL_WORKING 0x00 +/* get the crtc_id from the mode object id? */ +#define AMDGPU_INFO_CRTC_FROM_ID 0x01 +/* query hw IP info */ +#define AMDGPU_INFO_HW_IP_INFO 0x02 +/* query hw IP instance count for the specified type */ +#define AMDGPU_INFO_HW_IP_COUNT 0x03 +/* timestamp for GL_ARB_timer_query */ +#define AMDGPU_INFO_TIMESTAMP 0x05 +/* Query the firmware version */ +#define AMDGPU_INFO_FW_VERSION 0x0e + /* Subquery id: Query VCE firmware version */ + #define AMDGPU_INFO_FW_VCE 0x1 + /* Subquery id: Query UVD firmware version */ + #define AMDGPU_INFO_FW_UVD 0x2 + /* Subquery id: Query GMC firmware version */ + #define AMDGPU_INFO_FW_GMC 0x03 + /* Subquery id: Query GFX ME firmware version */ + #define AMDGPU_INFO_FW_GFX_ME 0x04 + /* Subquery id: Query GFX PFP firmware version */ + #define AMDGPU_INFO_FW_GFX_PFP 0x05 + /* Subquery id: Query GFX CE firmware version */ + #define AMDGPU_INFO_FW_GFX_CE 0x06 + /* Subquery id: Query GFX RLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC 0x07 + /* Subquery id: Query GFX MEC firmware version */ + #define AMDGPU_INFO_FW_GFX_MEC 0x08 + /* Subquery id: Query SMC firmware version */ + #define AMDGPU_INFO_FW_SMC 0x0a + /* Subquery id: Query SDMA firmware version */ + #define AMDGPU_INFO_FW_SDMA 0x0b + /* Subquery id: Query PSP SOS firmware version */ + #define AMDGPU_INFO_FW_SOS 0x0c + /* Subquery id: Query PSP ASD firmware version */ + #define AMDGPU_INFO_FW_ASD 0x0d + /* Subquery id: Query VCN firmware 
version */ + #define AMDGPU_INFO_FW_VCN 0x0e + /* Subquery id: Query GFX RLC SRLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f + /* Subquery id: Query GFX RLC SRLG firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10 + /* Subquery id: Query GFX RLC SRLS firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11 + /* Subquery id: Query DMCU firmware version */ + #define AMDGPU_INFO_FW_DMCU 0x12 + #define AMDGPU_INFO_FW_TA 0x13 + /* Subquery id: Query DMCUB firmware version */ + #define AMDGPU_INFO_FW_DMCUB 0x14 + /* Subquery id: Query TOC firmware version */ + #define AMDGPU_INFO_FW_TOC 0x15 + /* Subquery id: Query CAP firmware version */ + #define AMDGPU_INFO_FW_CAP 0x16 + /* Subquery id: Query GFX RLCP firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCP 0x17 + /* Subquery id: Query GFX RLCV firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCV 0x18 + /* Subquery id: Query MES_KIQ firmware version */ + #define AMDGPU_INFO_FW_MES_KIQ 0x19 + /* Subquery id: Query MES firmware version */ + #define AMDGPU_INFO_FW_MES 0x1a + /* Subquery id: Query IMU firmware version */ + #define AMDGPU_INFO_FW_IMU 0x1b + /* Subquery id: Query VPE firmware version */ + #define AMDGPU_INFO_FW_VPE 0x1c + +/* number of bytes moved for TTM migration */ +#define AMDGPU_INFO_NUM_BYTES_MOVED 0x0f +/* the used VRAM size */ +#define AMDGPU_INFO_VRAM_USAGE 0x10 +/* the used GTT size */ +#define AMDGPU_INFO_GTT_USAGE 0x11 +/* Information about GDS, etc. resource configuration */ +#define AMDGPU_INFO_GDS_CONFIG 0x13 +/* Query information about VRAM and GTT domains */ +#define AMDGPU_INFO_VRAM_GTT 0x14 +/* Query information about register in MMR address space*/ +#define AMDGPU_INFO_READ_MMR_REG 0x15 +/* Query information about device: rev id, family, etc. 
*/ +#define AMDGPU_INFO_DEV_INFO 0x16 +/* visible vram usage */ +#define AMDGPU_INFO_VIS_VRAM_USAGE 0x17 +/* number of TTM buffer evictions */ +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +/* Query memory about VRAM and GTT domains */ +#define AMDGPU_INFO_MEMORY 0x19 +/* Query vce clock table */ +#define AMDGPU_INFO_VCE_CLOCK_TABLE 0x1A +/* Query vbios related information */ +#define AMDGPU_INFO_VBIOS 0x1B + /* Subquery id: Query vbios size */ + #define AMDGPU_INFO_VBIOS_SIZE 0x1 + /* Subquery id: Query vbios image */ + #define AMDGPU_INFO_VBIOS_IMAGE 0x2 + /* Subquery id: Query vbios info */ + #define AMDGPU_INFO_VBIOS_INFO 0x3 +/* Query UVD handles */ +#define AMDGPU_INFO_NUM_HANDLES 0x1C +/* Query sensor related information */ +#define AMDGPU_INFO_SENSOR 0x1D + /* Subquery id: Query GPU shader clock */ + #define AMDGPU_INFO_SENSOR_GFX_SCLK 0x1 + /* Subquery id: Query GPU memory clock */ + #define AMDGPU_INFO_SENSOR_GFX_MCLK 0x2 + /* Subquery id: Query GPU temperature */ + #define AMDGPU_INFO_SENSOR_GPU_TEMP 0x3 + /* Subquery id: Query GPU load */ + #define AMDGPU_INFO_SENSOR_GPU_LOAD 0x4 + /* Subquery id: Query average GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_AVG_POWER 0x5 + /* Subquery id: Query northbridge voltage */ + #define AMDGPU_INFO_SENSOR_VDDNB 0x6 + /* Subquery id: Query graphics voltage */ + #define AMDGPU_INFO_SENSOR_VDDGFX 0x7 + /* Subquery id: Query GPU stable pstate shader clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK 0x8 + /* Subquery id: Query GPU stable pstate memory clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK 0x9 + /* Subquery id: Query GPU peak pstate shader clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa + /* Subquery id: Query GPU peak pstate memory clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb + /* Subquery id: Query input GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc +/* Number of VRAM page faults on CPU access. 
*/ +#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E +#define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F +/* query ras mask of enabled features*/ +#define AMDGPU_INFO_RAS_ENABLED_FEATURES 0x20 +/* RAS MASK: UMC (VRAM) */ +#define AMDGPU_INFO_RAS_ENABLED_UMC (1 << 0) +/* RAS MASK: SDMA */ +#define AMDGPU_INFO_RAS_ENABLED_SDMA (1 << 1) +/* RAS MASK: GFX */ +#define AMDGPU_INFO_RAS_ENABLED_GFX (1 << 2) +/* RAS MASK: MMHUB */ +#define AMDGPU_INFO_RAS_ENABLED_MMHUB (1 << 3) +/* RAS MASK: ATHUB */ +#define AMDGPU_INFO_RAS_ENABLED_ATHUB (1 << 4) +/* RAS MASK: PCIE */ +#define AMDGPU_INFO_RAS_ENABLED_PCIE (1 << 5) +/* RAS MASK: HDP */ +#define AMDGPU_INFO_RAS_ENABLED_HDP (1 << 6) +/* RAS MASK: XGMI */ +#define AMDGPU_INFO_RAS_ENABLED_XGMI (1 << 7) +/* RAS MASK: DF */ +#define AMDGPU_INFO_RAS_ENABLED_DF (1 << 8) +/* RAS MASK: SMN */ +#define AMDGPU_INFO_RAS_ENABLED_SMN (1 << 9) +/* RAS MASK: SEM */ +#define AMDGPU_INFO_RAS_ENABLED_SEM (1 << 10) +/* RAS MASK: MP0 */ +#define AMDGPU_INFO_RAS_ENABLED_MP0 (1 << 11) +/* RAS MASK: MP1 */ +#define AMDGPU_INFO_RAS_ENABLED_MP1 (1 << 12) +/* RAS MASK: FUSE */ +#define AMDGPU_INFO_RAS_ENABLED_FUSE (1 << 13) +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS 0x21 + /* Subquery id: Decode */ + #define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 + /* Subquery id: Encode */ + #define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 +/* Query the max number of IBs per gang per submission */ +#define AMDGPU_INFO_MAX_IBS 0x22 +/* query last page fault info */ +#define AMDGPU_INFO_GPUVM_FAULT 0x23 + +/* gpu capability */ +#define AMDGPU_INFO_CAPABILITY 0x50 +/* virtual range */ +#define AMDGPU_INFO_VIRTUAL_RANGE 0x51 + +#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 +#define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff +#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 +#define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff + +struct drm_amdgpu_query_fw { + /** AMDGPU_INFO_FW_* */ + __u32 fw_type; + /** + * Index of the IP if there are more IPs of + * the same type. 
+ */ + __u32 ip_instance; + /** + * Index of the engine. Whether this is used depends + * on the firmware type. (e.g. MEC, SDMA) + */ + __u32 index; + __u32 _pad; +}; + +/* Input structure for the INFO ioctl */ +struct drm_amdgpu_info { + /* Where the return value will be stored */ + __u64 return_pointer; + /* The size of the return value. Just like "size" in "snprintf", + * it limits how many bytes the kernel can write. */ + __u32 return_size; + /* The query request id. */ + __u32 query; + + union { + struct { + __u32 id; + __u32 _pad; + } mode_crtc; + + struct { + /** AMDGPU_HW_IP_* */ + __u32 type; + /** + * Index of the IP if there are more IPs of the same + * type. Ignored by AMDGPU_INFO_HW_IP_COUNT. + */ + __u32 ip_instance; + } query_hw_ip; + + struct { + __u32 dword_offset; + /** number of registers to read */ + __u32 count; + __u32 instance; + /** For future use, no flags defined so far */ + __u32 flags; + } read_mmr_reg; + + struct { + uint32_t aperture; + uint32_t _pad; + } virtual_range; + + struct drm_amdgpu_query_fw query_fw; + + struct { + __u32 type; + __u32 offset; + } vbios_info; + + struct { + __u32 type; + } sensor_info; + + struct { + __u32 type; + } video_cap; + }; +}; + +struct drm_amdgpu_info_gds { + /** GDS GFX partition size */ + __u32 gds_gfx_partition_size; + /** GDS compute partition size */ + __u32 compute_partition_size; + /** total GDS memory size */ + __u32 gds_total_size; + /** GWS size per GFX partition */ + __u32 gws_per_gfx_partition; + /** GSW size per compute partition */ + __u32 gws_per_compute_partition; + /** OA size per GFX partition */ + __u32 oa_per_gfx_partition; + /** OA size per compute partition */ + __u32 oa_per_compute_partition; + __u32 _pad; +}; + +struct drm_amdgpu_info_vram_gtt { + __u64 vram_size; + __u64 vram_cpu_accessible_size; + __u64 gtt_size; +}; + +struct drm_amdgpu_heap_info { + /** max. physical memory */ + __u64 total_heap_size; + + /** Theoretical max. 
available memory in the given heap */ + __u64 usable_heap_size; + + /** + * Number of bytes allocated in the heap. This includes all processes + * and private allocations in the kernel. It changes when new buffers + * are allocated, freed, and moved. It cannot be larger than + * heap_size. + */ + __u64 heap_usage; + + /** + * Theoretical possible max. size of buffer which + * could be allocated in the given heap + */ + __u64 max_allocation; +}; + +struct drm_amdgpu_memory_info { + struct drm_amdgpu_heap_info vram; + struct drm_amdgpu_heap_info cpu_accessible_vram; + struct drm_amdgpu_heap_info gtt; +}; + +struct drm_amdgpu_info_firmware { + __u32 ver; + __u32 feature; +}; + +struct drm_amdgpu_info_vbios { + __u8 name[64]; + __u8 vbios_pn[64]; + __u32 version; + __u32 pad; + __u8 vbios_ver_str[32]; + __u8 date[32]; +}; + +#define AMDGPU_VRAM_TYPE_UNKNOWN 0 +#define AMDGPU_VRAM_TYPE_GDDR1 1 +#define AMDGPU_VRAM_TYPE_DDR2 2 +#define AMDGPU_VRAM_TYPE_GDDR3 3 +#define AMDGPU_VRAM_TYPE_GDDR4 4 +#define AMDGPU_VRAM_TYPE_GDDR5 5 +#define AMDGPU_VRAM_TYPE_HBM 6 +#define AMDGPU_VRAM_TYPE_DDR3 7 +#define AMDGPU_VRAM_TYPE_DDR4 8 +#define AMDGPU_VRAM_TYPE_GDDR6 9 +#define AMDGPU_VRAM_TYPE_DDR5 10 +#define AMDGPU_VRAM_TYPE_LPDDR4 11 +#define AMDGPU_VRAM_TYPE_LPDDR5 12 + +struct drm_amdgpu_info_device { + /** PCI Device ID */ + __u32 device_id; + /** Internal chip revision: A0, A1, etc.) */ + __u32 chip_rev; + __u32 external_rev; + /** Revision id in PCI Config space */ + __u32 pci_rev; + __u32 family; + __u32 num_shader_engines; + __u32 num_shader_arrays_per_engine; + /* in KHz */ + __u32 gpu_counter_freq; + __u64 max_engine_clock; + __u64 max_memory_clock; + /* cu information */ + __u32 cu_active_number; + /* NOTE: cu_ao_mask is INVALID, DON'T use it */ + __u32 cu_ao_mask; + __u32 cu_bitmap[4][4]; + /** Render backend pipe mask. One render backend is CB+DB. 
*/ + __u32 enabled_rb_pipes_mask; + __u32 num_rb_pipes; + __u32 num_hw_gfx_contexts; + /* PCIe version (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_gen; + __u64 ids_flags; + /** Starting virtual address for UMDs. */ + __u64 virtual_address_offset; + /** The maximum virtual address */ + __u64 virtual_address_max; + /** Required alignment of virtual addresses. */ + __u32 virtual_address_alignment; + /** Page table entry - fragment size */ + __u32 pte_fragment_size; + __u32 gart_page_size; + /** constant engine ram size*/ + __u32 ce_ram_size; + /** video memory type info*/ + __u32 vram_type; + /** video memory bit width*/ + __u32 vram_bit_width; + /* vce harvesting instance */ + __u32 vce_harvest_config; + /* gfx double offchip LDS buffers */ + __u32 gc_double_offchip_lds_buf; + /* NGG Primitive Buffer */ + __u64 prim_buf_gpu_addr; + /* NGG Position Buffer */ + __u64 pos_buf_gpu_addr; + /* NGG Control Sideband */ + __u64 cntl_sb_buf_gpu_addr; + /* NGG Parameter Cache */ + __u64 param_buf_gpu_addr; + __u32 prim_buf_size; + __u32 pos_buf_size; + __u32 cntl_sb_buf_size; + __u32 param_buf_size; + /* wavefront size*/ + __u32 wave_front_size; + /* shader visible vgprs*/ + __u32 num_shader_visible_vgprs; + /* CU per shader array*/ + __u32 num_cu_per_sh; + /* number of tcc blocks*/ + __u32 num_tcc_blocks; + /* gs vgt table depth*/ + __u32 gs_vgt_table_depth; + /* gs primitive buffer depth*/ + __u32 gs_prim_buffer_depth; + /* max gs wavefront per vgt*/ + __u32 max_gs_waves_per_vgt; + /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_num_lanes; + /* always on cu bitmap */ + __u32 cu_ao_bitmap[4][4]; + /** Starting high virtual address for UMDs. 
*/ + __u64 high_va_offset; + /** The maximum high virtual address */ + __u64 high_va_max; + /* gfx10 pa_sc_tile_steering_override */ + __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; + __u64 min_engine_clock; + __u64 min_memory_clock; + /* The following fields are only set on gfx11+, older chips set 0. */ + __u32 tcp_cache_size; /* AKA GL0, VMEM cache */ + __u32 num_sqc_per_wgp; + __u32 sqc_data_cache_size; /* AKA SMEM cache */ + __u32 sqc_inst_cache_size; + __u32 gl1c_cache_size; + __u32 gl2c_cache_size; + __u64 mall_size; /* AKA infinity cache */ + /* high 32 bits of the rb pipes mask */ + __u32 enabled_rb_pipes_mask_hi; + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; +}; + +struct drm_amdgpu_info_hw_ip { + /** Version of h/w IP */ + __u32 hw_ip_version_major; + __u32 hw_ip_version_minor; + /** Capabilities */ + __u64 capabilities_flags; + /** command buffer address start alignment*/ + __u32 ib_start_alignment; + /** command buffer size alignment*/ + __u32 ib_size_alignment; + /** Bitmask of available rings. Bit 0 means ring 0, etc. 
*/ + __u32 available_rings; + /** version info: bits 23:16 major, 15:8 minor, 7:0 revision */ + __u32 ip_discovery_version; +}; + +struct drm_amdgpu_info_num_handles { + /** Max handles as supported by firmware for UVD */ + __u32 uvd_max_handles; + /** Handles currently in use for UVD */ + __u32 uvd_used_handles; +}; + +#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES 6 + +struct drm_amdgpu_info_vce_clock_table_entry { + /** System clock */ + __u32 sclk; + /** Memory clock */ + __u32 mclk; + /** VCE clock */ + __u32 eclk; + __u32 pad; +}; + +struct drm_amdgpu_info_vce_clock_table { + struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES]; + __u32 num_valid_entries; + __u32 pad; +}; + +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8 + +struct drm_amdgpu_info_video_codec_info { + __u32 valid; + __u32 max_width; + __u32 max_height; + __u32 max_pixels_per_frame; + __u32 max_level; + __u32 pad; +}; + +struct drm_amdgpu_info_video_caps { + struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT]; +}; + +#define AMDGPU_VMHUB_TYPE_MASK 0xff +#define AMDGPU_VMHUB_TYPE_SHIFT 0 +#define AMDGPU_VMHUB_TYPE_GFX 0 +#define AMDGPU_VMHUB_TYPE_MM0 1 +#define AMDGPU_VMHUB_TYPE_MM1 2 +#define AMDGPU_VMHUB_IDX_MASK 0xff00 +#define AMDGPU_VMHUB_IDX_SHIFT 8 + +struct drm_amdgpu_info_gpuvm_fault { + __u64 addr; + __u32 status; + __u32 vmhub; +}; + +/* + * Supported GPU families + */ +#define AMDGPU_FAMILY_UNKNOWN 0 +#define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */ +#define 
AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */ +#define AMDGPU_FAMILY_KV 125 /* Kaveri, Kabini, Mullins */ +#define AMDGPU_FAMILY_VI 130 /* Iceland, Tonga */ +#define AMDGPU_FAMILY_CZ 135 /* Carrizo, Stoney */ +#define AMDGPU_FAMILY_AI 141 /* Vega10 */ +#define AMDGPU_FAMILY_RV 142 /* Raven */ +#define AMDGPU_FAMILY_NV 143 /* Navi10 */ +#define AMDGPU_FAMILY_VGH 144 /* Van Gogh */ +#define AMDGPU_FAMILY_GC_11_0_0 145 /* GC 11.0.0 */ +#define AMDGPU_FAMILY_YC 146 /* Yellow Carp */ +#define AMDGPU_FAMILY_GC_11_0_1 148 /* GC 11.0.1 */ +#define AMDGPU_FAMILY_GC_10_3_6 149 /* GC 10.3.6 */ +#define AMDGPU_FAMILY_GC_10_3_7 151 /* GC 10.3.7 */ +#define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ +#define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ + +/** + * Definition of System Unified Address (SUA) apertures + */ +#define AMDGPU_SUA_APERTURE_PRIVATE 1 +#define AMDGPU_SUA_APERTURE_SHARED 2 +struct drm_amdgpu_virtual_range { + uint64_t start; + uint64_t end; +}; + +/* query pin memory capability */ +#define AMDGPU_CAPABILITY_PIN_MEM_FLAG (1 << 0) +/* query direct gma capability */ +#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG (1 << 1) + +struct drm_amdgpu_capability { + uint32_t flag; + uint32_t direct_gma_size; +}; + + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm.h new file mode 100644 index 0000000000..4e4f7c2c39 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm.h @@ -0,0 +1,1408 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? 
*/ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). 
+ */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). + */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. 
+ */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? 
*/ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. 
+ * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. 
+ */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void *virt; +#else + void *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_indices; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int *request_indices; /**< Buffer information */ + int *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). 
+ */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). 
+ */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. 
*/ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. 
+ * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. 
+ */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. 
+ */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. 
+ * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restrictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags..
only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on a specific timeline point for every handle */ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle.
+ * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to send events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence.
on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK 
DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct 
drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. + * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. 
+ */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) 
+#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernel fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. + * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer.
+ * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. 
+ */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. + */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. 
Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct 
drm_set_version drm_set_version_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm_mode.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm_mode.h new file mode 100644 index 0000000000..d390011b89 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/drm_mode.h @@ -0,0 +1,1360 @@ +/* + * Copyright (c) 2007 Dave Airlie + * Copyright (c) 2007 Jakob Bornecrantz + * Copyright (c) 2008 Red Hat Inc. + * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * Copyright (c) 2007-2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _DRM_MODE_H +#define _DRM_MODE_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definitions to have a consistent + * and standardized interface with users. 
+ * Userspace can refer to these structure definitions and UAPI formats + * to communicate to drivers. + */ + +#define DRM_CONNECTOR_NAME_LEN 32 +#define DRM_DISPLAY_MODE_LEN 32 +#define DRM_PROP_NAME_LEN 32 + +#define DRM_MODE_TYPE_BUILTIN (1<<0) /* deprecated */ +#define DRM_MODE_TYPE_CLOCK_C ((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_CRTC_C ((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_PREFERRED (1<<3) +#define DRM_MODE_TYPE_DEFAULT (1<<4) /* deprecated */ +#define DRM_MODE_TYPE_USERDEF (1<<5) +#define DRM_MODE_TYPE_DRIVER (1<<6) + +#define DRM_MODE_TYPE_ALL (DRM_MODE_TYPE_PREFERRED | \ + DRM_MODE_TYPE_USERDEF | \ + DRM_MODE_TYPE_DRIVER) + +/* Video mode flags */ +/* bit compatible with the xrandr RR_ definitions (bits 0-13) + * + * ABI warning: Existing userspace really expects + * the mode flags to match the xrandr definitions. Any + * changes that don't match the xrandr definitions will + * likely need a new client cap or some other mechanism + * to avoid breaking existing userspace. This includes + * allocating new flags in the previously unused bits! + */ +#define DRM_MODE_FLAG_PHSYNC (1<<0) +#define DRM_MODE_FLAG_NHSYNC (1<<1) +#define DRM_MODE_FLAG_PVSYNC (1<<2) +#define DRM_MODE_FLAG_NVSYNC (1<<3) +#define DRM_MODE_FLAG_INTERLACE (1<<4) +#define DRM_MODE_FLAG_DBLSCAN (1<<5) +#define DRM_MODE_FLAG_CSYNC (1<<6) +#define DRM_MODE_FLAG_PCSYNC (1<<7) +#define DRM_MODE_FLAG_NCSYNC (1<<8) +#define DRM_MODE_FLAG_HSKEW (1<<9) /* hskew provided */ +#define DRM_MODE_FLAG_BCAST (1<<10) /* deprecated */ +#define DRM_MODE_FLAG_PIXMUX (1<<11) /* deprecated */ +#define DRM_MODE_FLAG_DBLCLK (1<<12) +#define DRM_MODE_FLAG_CLKDIV2 (1<<13) + /* + * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX + * (define not exposed to user space). 
+ */ +#define DRM_MODE_FLAG_3D_MASK (0x1f<<14) +#define DRM_MODE_FLAG_3D_NONE (0<<14) +#define DRM_MODE_FLAG_3D_FRAME_PACKING (1<<14) +#define DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE (2<<14) +#define DRM_MODE_FLAG_3D_LINE_ALTERNATIVE (3<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL (4<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH (5<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH (6<<14) +#define DRM_MODE_FLAG_3D_TOP_AND_BOTTOM (7<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF (8<<14) + +/* Picture aspect ratio options */ +#define DRM_MODE_PICTURE_ASPECT_NONE 0 +#define DRM_MODE_PICTURE_ASPECT_4_3 1 +#define DRM_MODE_PICTURE_ASPECT_16_9 2 +#define DRM_MODE_PICTURE_ASPECT_64_27 3 +#define DRM_MODE_PICTURE_ASPECT_256_135 4 + +/* Content type options */ +#define DRM_MODE_CONTENT_TYPE_NO_DATA 0 +#define DRM_MODE_CONTENT_TYPE_GRAPHICS 1 +#define DRM_MODE_CONTENT_TYPE_PHOTO 2 +#define DRM_MODE_CONTENT_TYPE_CINEMA 3 +#define DRM_MODE_CONTENT_TYPE_GAME 4 + +/* Aspect ratio flag bitmask (4 bits 22:19) */ +#define DRM_MODE_FLAG_PIC_AR_MASK (0x0F<<19) +#define DRM_MODE_FLAG_PIC_AR_NONE \ + (DRM_MODE_PICTURE_ASPECT_NONE<<19) +#define DRM_MODE_FLAG_PIC_AR_4_3 \ + (DRM_MODE_PICTURE_ASPECT_4_3<<19) +#define DRM_MODE_FLAG_PIC_AR_16_9 \ + (DRM_MODE_PICTURE_ASPECT_16_9<<19) +#define DRM_MODE_FLAG_PIC_AR_64_27 \ + (DRM_MODE_PICTURE_ASPECT_64_27<<19) +#define DRM_MODE_FLAG_PIC_AR_256_135 \ + (DRM_MODE_PICTURE_ASPECT_256_135<<19) + +#define DRM_MODE_FLAG_ALL (DRM_MODE_FLAG_PHSYNC | \ + DRM_MODE_FLAG_NHSYNC | \ + DRM_MODE_FLAG_PVSYNC | \ + DRM_MODE_FLAG_NVSYNC | \ + DRM_MODE_FLAG_INTERLACE | \ + DRM_MODE_FLAG_DBLSCAN | \ + DRM_MODE_FLAG_CSYNC | \ + DRM_MODE_FLAG_PCSYNC | \ + DRM_MODE_FLAG_NCSYNC | \ + DRM_MODE_FLAG_HSKEW | \ + DRM_MODE_FLAG_DBLCLK | \ + DRM_MODE_FLAG_CLKDIV2 | \ + DRM_MODE_FLAG_3D_MASK) + +/* DPMS flags */ +/* bit compatible with the xorg definitions. 
*/ +#define DRM_MODE_DPMS_ON 0 +#define DRM_MODE_DPMS_STANDBY 1 +#define DRM_MODE_DPMS_SUSPEND 2 +#define DRM_MODE_DPMS_OFF 3 + +/* Scaling mode options */ +#define DRM_MODE_SCALE_NONE 0 /* Unmodified timing (display or + software can still scale) */ +#define DRM_MODE_SCALE_FULLSCREEN 1 /* Full screen, ignore aspect */ +#define DRM_MODE_SCALE_CENTER 2 /* Centered, no scaling */ +#define DRM_MODE_SCALE_ASPECT 3 /* Full screen, preserve aspect */ + +/* Dithering mode options */ +#define DRM_MODE_DITHERING_OFF 0 +#define DRM_MODE_DITHERING_ON 1 +#define DRM_MODE_DITHERING_AUTO 2 + +/* Dirty info options */ +#define DRM_MODE_DIRTY_OFF 0 +#define DRM_MODE_DIRTY_ON 1 +#define DRM_MODE_DIRTY_ANNOTATE 2 + +/* Link Status options */ +#define DRM_MODE_LINK_STATUS_GOOD 0 +#define DRM_MODE_LINK_STATUS_BAD 1 + +/* + * DRM_MODE_ROTATE_<degrees> + * + * Signals that a drm plane has been rotated <degrees> degrees in counter + * clockwise direction. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_ROTATE_0 (1<<0) +#define DRM_MODE_ROTATE_90 (1<<1) +#define DRM_MODE_ROTATE_180 (1<<2) +#define DRM_MODE_ROTATE_270 (1<<3) + +/* + * DRM_MODE_ROTATE_MASK + * + * Bitmask used to look for drm plane rotations. + */ +#define DRM_MODE_ROTATE_MASK (\ + DRM_MODE_ROTATE_0 | \ + DRM_MODE_ROTATE_90 | \ + DRM_MODE_ROTATE_180 | \ + DRM_MODE_ROTATE_270) + +/* + * DRM_MODE_REFLECT_<axis> + * + * Signals that the contents of a drm plane is reflected along the <axis> axis, + * in the same way as mirroring. + * See kerneldoc chapter "Plane Composition Properties" for more details. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_REFLECT_X (1<<4) +#define DRM_MODE_REFLECT_Y (1<<5) + +/* + * DRM_MODE_REFLECT_MASK + * + * Bitmask used to look for drm plane reflections. 
+ */ +#define DRM_MODE_REFLECT_MASK (\ + DRM_MODE_REFLECT_X | \ + DRM_MODE_REFLECT_Y) + +/* Content Protection Flags */ +#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED 0 +#define DRM_MODE_CONTENT_PROTECTION_DESIRED 1 +#define DRM_MODE_CONTENT_PROTECTION_ENABLED 2 + +/** + * struct drm_mode_modeinfo - Display mode information. + * @clock: pixel clock in kHz + * @hdisplay: horizontal display size + * @hsync_start: horizontal sync start + * @hsync_end: horizontal sync end + * @htotal: horizontal total size + * @hskew: horizontal skew + * @vdisplay: vertical display size + * @vsync_start: vertical sync start + * @vsync_end: vertical sync end + * @vtotal: vertical total size + * @vscan: vertical scan + * @vrefresh: approximate vertical refresh rate in Hz + * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines + * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines + * @name: string describing the mode resolution + * + * This is the user-space API display mode information structure. For the + * kernel version see struct drm_display_mode. 
+ */ +struct drm_mode_modeinfo { + __u32 clock; + __u16 hdisplay; + __u16 hsync_start; + __u16 hsync_end; + __u16 htotal; + __u16 hskew; + __u16 vdisplay; + __u16 vsync_start; + __u16 vsync_end; + __u16 vtotal; + __u16 vscan; + + __u32 vrefresh; + + __u32 flags; + __u32 type; + char name[DRM_DISPLAY_MODE_LEN]; +}; + +struct drm_mode_card_res { + __u64 fb_id_ptr; + __u64 crtc_id_ptr; + __u64 connector_id_ptr; + __u64 encoder_id_ptr; + __u32 count_fbs; + __u32 count_crtcs; + __u32 count_connectors; + __u32 count_encoders; + __u32 min_width; + __u32 max_width; + __u32 min_height; + __u32 max_height; +}; + +struct drm_mode_crtc { + __u64 set_connectors_ptr; + __u32 count_connectors; + + __u32 crtc_id; /**< Id */ + __u32 fb_id; /**< Id of framebuffer */ + + __u32 x; /**< x Position on the framebuffer */ + __u32 y; /**< y Position on the framebuffer */ + + __u32 gamma_size; + __u32 mode_valid; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_PRESENT_TOP_FIELD (1<<0) +#define DRM_MODE_PRESENT_BOTTOM_FIELD (1<<1) + +/* Planes blend with or override other bits on the CRTC */ +struct drm_mode_set_plane { + __u32 plane_id; + __u32 crtc_id; + __u32 fb_id; /* fb object contains surface format type */ + __u32 flags; /* see above flags */ + + /* Signed dest location allows it to be partially off screen */ + __s32 crtc_x; + __s32 crtc_y; + __u32 crtc_w; + __u32 crtc_h; + + /* Source values are 16.16 fixed point */ + __u32 src_x; + __u32 src_y; + __u32 src_h; + __u32 src_w; +}; + +/** + * struct drm_mode_get_plane - Get plane metadata. + * + * Userspace can perform a GETPLANE ioctl to retrieve information about a + * plane. + * + * To retrieve the number of formats supported, set @count_format_types to zero + * and call the ioctl. @count_format_types will be updated with the value. + * + * To retrieve these formats, allocate an array with the memory needed to store + * @count_format_types formats. 
Point @format_type_ptr to this array and call + * the ioctl again (with @count_format_types still set to the value returned in + * the first ioctl call). + */ +struct drm_mode_get_plane { + /** + * @plane_id: Object ID of the plane whose information should be + * retrieved. Set by caller. + */ + __u32 plane_id; + + /** @crtc_id: Object ID of the current CRTC. */ + __u32 crtc_id; + /** @fb_id: Object ID of the current fb. */ + __u32 fb_id; + + /** + * @possible_crtcs: Bitmask of CRTCs compatible with the plane. CRTCs + * are created and they receive an index, which corresponds to their + * position in the bitmask. Bit N corresponds to + * :ref:`CRTC index<crtc_index>` N. + */ + __u32 possible_crtcs; + /** @gamma_size: Never used. */ + __u32 gamma_size; + + /** @count_format_types: Number of formats. */ + __u32 count_format_types; + /** + * @format_type_ptr: Pointer to ``__u32`` array of formats that are + * supported by the plane. These formats do not require modifiers. + */ + __u64 format_type_ptr; +}; + +struct drm_mode_get_plane_res { + __u64 plane_id_ptr; + __u32 count_planes; +}; + +#define DRM_MODE_ENCODER_NONE 0 +#define DRM_MODE_ENCODER_DAC 1 +#define DRM_MODE_ENCODER_TMDS 2 +#define DRM_MODE_ENCODER_LVDS 3 +#define DRM_MODE_ENCODER_TVDAC 4 +#define DRM_MODE_ENCODER_VIRTUAL 5 +#define DRM_MODE_ENCODER_DSI 6 +#define DRM_MODE_ENCODER_DPMST 7 +#define DRM_MODE_ENCODER_DPI 8 + +struct drm_mode_get_encoder { + __u32 encoder_id; + __u32 encoder_type; + + __u32 crtc_id; /**< Id of crtc */ + + __u32 possible_crtcs; + __u32 possible_clones; +}; + +/* This is for connectors with multiple signal types. */ +/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. 
*/ +enum drm_mode_subconnector { + DRM_MODE_SUBCONNECTOR_Automatic = 0, /* DVI-I, TV */ + DRM_MODE_SUBCONNECTOR_Unknown = 0, /* DVI-I, TV, DP */ + DRM_MODE_SUBCONNECTOR_VGA = 1, /* DP */ + DRM_MODE_SUBCONNECTOR_DVID = 3, /* DVI-I DP */ + DRM_MODE_SUBCONNECTOR_DVIA = 4, /* DVI-I */ + DRM_MODE_SUBCONNECTOR_Composite = 5, /* TV */ + DRM_MODE_SUBCONNECTOR_SVIDEO = 6, /* TV */ + DRM_MODE_SUBCONNECTOR_Component = 8, /* TV */ + DRM_MODE_SUBCONNECTOR_SCART = 9, /* TV */ + DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /* DP */ + DRM_MODE_SUBCONNECTOR_HDMIA = 11, /* DP */ + DRM_MODE_SUBCONNECTOR_Native = 15, /* DP */ + DRM_MODE_SUBCONNECTOR_Wireless = 18, /* DP */ +}; + +#define DRM_MODE_CONNECTOR_Unknown 0 +#define DRM_MODE_CONNECTOR_VGA 1 +#define DRM_MODE_CONNECTOR_DVII 2 +#define DRM_MODE_CONNECTOR_DVID 3 +#define DRM_MODE_CONNECTOR_DVIA 4 +#define DRM_MODE_CONNECTOR_Composite 5 +#define DRM_MODE_CONNECTOR_SVIDEO 6 +#define DRM_MODE_CONNECTOR_LVDS 7 +#define DRM_MODE_CONNECTOR_Component 8 +#define DRM_MODE_CONNECTOR_9PinDIN 9 +#define DRM_MODE_CONNECTOR_DisplayPort 10 +#define DRM_MODE_CONNECTOR_HDMIA 11 +#define DRM_MODE_CONNECTOR_HDMIB 12 +#define DRM_MODE_CONNECTOR_TV 13 +#define DRM_MODE_CONNECTOR_eDP 14 +#define DRM_MODE_CONNECTOR_VIRTUAL 15 +#define DRM_MODE_CONNECTOR_DSI 16 +#define DRM_MODE_CONNECTOR_DPI 17 +#define DRM_MODE_CONNECTOR_WRITEBACK 18 +#define DRM_MODE_CONNECTOR_SPI 19 +#define DRM_MODE_CONNECTOR_USB 20 + +/** + * struct drm_mode_get_connector - Get connector metadata. + * + * User-space can perform a GETCONNECTOR ioctl to retrieve information about a + * connector. User-space is expected to retrieve encoders, modes and properties + * by performing this ioctl at least twice: the first time to retrieve the + * number of elements, the second time to retrieve the elements themselves. 
+ * + * To retrieve the number of elements, set @count_props and @count_encoders to + * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct + * drm_mode_modeinfo element. + * + * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr, + * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and + * @count_encoders to their capacity. + * + * Performing the ioctl only twice may be racy: the number of elements may have + * changed with a hotplug event in-between the two ioctls. User-space is + * expected to retry the last ioctl until the number of elements stabilizes. + * The kernel won't fill any array which doesn't have the expected length. + * + * **Force-probing a connector** + * + * If the @count_modes field is set to zero and the DRM client is the current + * DRM master, the kernel will perform a forced probe on the connector to + * refresh the connector status, modes and EDID. A forced-probe can be slow, + * might cause flickering and the ioctl will block. + * + * User-space needs to force-probe connectors to ensure their metadata is + * up-to-date at startup and after receiving a hot-plug event. User-space + * may perform a forced-probe when the user explicitly requests it. User-space + * shouldn't perform a forced-probe in other situations. + */ +struct drm_mode_get_connector { + /** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */ + __u64 encoders_ptr; + /** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */ + __u64 modes_ptr; + /** @props_ptr: Pointer to ``__u32`` array of property IDs. */ + __u64 props_ptr; + /** @prop_values_ptr: Pointer to ``__u64`` array of property values. */ + __u64 prop_values_ptr; + + /** @count_modes: Number of modes. */ + __u32 count_modes; + /** @count_props: Number of properties. */ + __u32 count_props; + /** @count_encoders: Number of encoders. */ + __u32 count_encoders; + + /** @encoder_id: Object ID of the current encoder. 
*/ + __u32 encoder_id; + /** @connector_id: Object ID of the connector. */ + __u32 connector_id; + /** + * @connector_type: Type of the connector. + * + * See DRM_MODE_CONNECTOR_* defines. + */ + __u32 connector_type; + /** + * @connector_type_id: Type-specific connector number. + * + * This is not an object ID. This is a per-type connector number. Each + * (type, type_id) combination is unique across all connectors of a DRM + * device. + * + * The (type, type_id) combination is not a stable identifier: the + * type_id can change depending on the driver probe order. + */ + __u32 connector_type_id; + + /** + * @connection: Status of the connector. + * + * See enum drm_connector_status. + */ + __u32 connection; + /** @mm_width: Width of the connected sink in millimeters. */ + __u32 mm_width; + /** @mm_height: Height of the connected sink in millimeters. */ + __u32 mm_height; + /** + * @subpixel: Subpixel order of the connected sink. + * + * See enum subpixel_order. + */ + __u32 subpixel; + + /** @pad: Padding, must be zero. */ + __u32 pad; +}; + +#define DRM_MODE_PROP_PENDING (1<<0) /* deprecated, do not use */ +#define DRM_MODE_PROP_RANGE (1<<1) +#define DRM_MODE_PROP_IMMUTABLE (1<<2) +#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */ +#define DRM_MODE_PROP_BLOB (1<<4) +#define DRM_MODE_PROP_BITMASK (1<<5) /* bitmask of enumerated types */ + +/* non-extended types: legacy bitmask, one bit per type: */ +#define DRM_MODE_PROP_LEGACY_TYPE ( \ + DRM_MODE_PROP_RANGE | \ + DRM_MODE_PROP_ENUM | \ + DRM_MODE_PROP_BLOB | \ + DRM_MODE_PROP_BITMASK) + +/* extended-types: rather than continue to consume a bit per type, + * grab a chunk of the bits to use as integer type id. 
+ */ +#define DRM_MODE_PROP_EXTENDED_TYPE 0x0000ffc0 +#define DRM_MODE_PROP_TYPE(n) ((n) << 6) +#define DRM_MODE_PROP_OBJECT DRM_MODE_PROP_TYPE(1) +#define DRM_MODE_PROP_SIGNED_RANGE DRM_MODE_PROP_TYPE(2) + +/* the PROP_ATOMIC flag is used to hide properties from userspace that + * is not aware of atomic properties. This is mostly to work around + * older userspace (DDX drivers) that read/write each prop they find, + * without being aware that this could be triggering a lengthy modeset. + */ +#define DRM_MODE_PROP_ATOMIC 0x80000000 + +/** + * struct drm_mode_property_enum - Description for an enum/bitfield entry. + * @value: numeric value for this enum entry. + * @name: symbolic name for this enum entry. + * + * See struct drm_property_enum for details. + */ +struct drm_mode_property_enum { + __u64 value; + char name[DRM_PROP_NAME_LEN]; +}; + +/** + * struct drm_mode_get_property - Get property metadata. + * + * User-space can perform a GETPROPERTY ioctl to retrieve information about a + * property. The same property may be attached to multiple objects, see + * "Modeset Base Object Abstraction". + * + * The meaning of the @values_ptr field changes depending on the property type. + * See &drm_property.flags for more details. + * + * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the + * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For + * backwards compatibility, the kernel will always set @count_enum_blobs to + * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must + * ignore these two fields if the property has a different type. + * + * User-space is expected to retrieve values and enums by performing this ioctl + * at least twice: the first time to retrieve the number of elements, the + * second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_values and @count_enum_blobs + * to zero, then call the ioctl. 
@count_values will be updated with the number + * of elements. If the property has the type &DRM_MODE_PROP_ENUM or + * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. + * + * To retrieve the elements themselves, allocate an array for @values_ptr and + * set @count_values to its capacity. If the property has the type + * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for + * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl + * again will fill the arrays. + */ +struct drm_mode_get_property { + /** @values_ptr: Pointer to a ``__u64`` array. */ + __u64 values_ptr; + /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ + __u64 enum_blob_ptr; + + /** + * @prop_id: Object ID of the property which should be retrieved. Set + * by the caller. + */ + __u32 prop_id; + /** + * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for + * a definition of the flags. + */ + __u32 flags; + /** + * @name: Symbolic property name. User-space should use this field to + * recognize properties. + */ + char name[DRM_PROP_NAME_LEN]; + + /** @count_values: Number of elements in @values_ptr. */ + __u32 count_values; + /** @count_enum_blobs: Number of elements in @enum_blob_ptr. 
*/ + __u32 count_enum_blobs; +}; + +struct drm_mode_connector_set_property { + __u64 value; + __u32 prop_id; + __u32 connector_id; +}; + +#define DRM_MODE_OBJECT_CRTC 0xcccccccc +#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0 +#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0 +#define DRM_MODE_OBJECT_MODE 0xdededede +#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0 +#define DRM_MODE_OBJECT_FB 0xfbfbfbfb +#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb +#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee +#define DRM_MODE_OBJECT_ANY 0 + +struct drm_mode_obj_get_properties { + __u64 props_ptr; + __u64 prop_values_ptr; + __u32 count_props; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_obj_set_property { + __u64 value; + __u32 prop_id; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_get_blob { + __u32 blob_id; + __u32 length; + __u64 data; +}; + +struct drm_mode_fb_cmd { + __u32 fb_id; + __u32 width; + __u32 height; + __u32 pitch; + __u32 bpp; + __u32 depth; + /* driver specific handle */ + __u32 handle; +}; + +#define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ +#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifier[] */ + +/** + * struct drm_mode_fb_cmd2 - Frame-buffer metadata. + * + * This struct holds frame-buffer metadata. There are two ways to use it: + * + * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2 + * ioctl to register a new frame-buffer. The new frame-buffer object ID will + * be set by the kernel in @fb_id. + * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to + * fetch metadata about an existing frame-buffer. + * + * In case of planar formats, this struct allows up to 4 buffer objects with + * offsets and pitches per plane. The pitch and offset order are dictated by + * the format FourCC as defined by ``drm_fourcc.h``, e.g. 
NV12 is described as: + * + * YUV 4:2:0 image with a plane of 8-bit Y samples followed by an + * interleaved U/V plane containing 8-bit 2x2 subsampled colour difference + * samples. + * + * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at + * ``offsets[1]``. + * + * To accommodate tiled, compressed, etc formats, a modifier can be specified. + * For more information see the "Format Modifiers" section. Note that even + * though it looks like we have a modifier per-plane, we in fact do not. The + * modifier for each plane must be identical. Thus all combinations of + * different data layouts for multi-plane formats must be enumerated as + * separate modifiers. + * + * All of the entries in @handles, @pitches, @offsets and @modifier must be + * zero when unused. Warning, for @offsets and @modifier zero can't be used to + * figure out whether the entry is used or not since it's a valid value (a zero + * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR). + */ +struct drm_mode_fb_cmd2 { + /** @fb_id: Object ID of the frame-buffer. */ + __u32 fb_id; + /** @width: Width of the frame-buffer. */ + __u32 width; + /** @height: Height of the frame-buffer. */ + __u32 height; + /** + * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in + * ``drm_fourcc.h``. + */ + __u32 pixel_format; + /** + * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and + * &DRM_MODE_FB_MODIFIERS). + */ + __u32 flags; + + /** + * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is + * unused. The same handle can be used for multiple planes. + */ + __u32 handles[4]; + /** @pitches: Pitch (aka. stride) in bytes, one per plane. */ + __u32 pitches[4]; + /** @offsets: Offset into the buffer in bytes, one per plane. */ + __u32 offsets[4]; + /** + * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*`` + * constants in ``drm_fourcc.h``. All planes must use the same + * modifier. 
Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags. + */ + __u64 modifier[4]; +}; + +#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 +#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 +#define DRM_MODE_FB_DIRTY_FLAGS 0x03 + +#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 + +/* + * Mark a region of a framebuffer as dirty. + * + * Some hardware does not automatically update display contents + * as a hardware or software draw to a framebuffer. This ioctl + * allows userspace to tell the kernel and the hardware what + * regions of the framebuffer have changed. + * + * The kernel or hardware is free to update more than just the + * region specified by the clip rects. The kernel or hardware + * may also delay and/or coalesce several calls to dirty into a + * single update. + * + * Userspace may annotate the updates, the annotations are a + * promise made by the caller that the change is either a copy + * of pixels or a fill of a single color in the region specified. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then + * the number of updated regions is half of num_clips given, + * where the clip rects are paired in src and dst. The width and + * height of each one of the pairs must match. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller + * promises that the region specified by the clip rects is filled + * completely with a single color as given in the color argument. + */ + +struct drm_mode_fb_dirty_cmd { + __u32 fb_id; + __u32 flags; + __u32 color; + __u32 num_clips; + __u64 clips_ptr; +}; + +struct drm_mode_mode_cmd { + __u32 connector_id; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_CURSOR_BO 0x01 +#define DRM_MODE_CURSOR_MOVE 0x02 +#define DRM_MODE_CURSOR_FLAGS 0x03 + +/* + * depending on the value in flags different members are used. 
+ * + * CURSOR_BO uses + * crtc_id + * width + * height + * handle - if 0 turns the cursor off + * + * CURSOR_MOVE uses + * crtc_id + * x + * y + */ +struct drm_mode_cursor { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; +}; + +struct drm_mode_cursor2 { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; + __s32 hot_x; + __s32 hot_y; +}; + +struct drm_mode_crtc_lut { + __u32 crtc_id; + __u32 gamma_size; + + /* pointers to arrays */ + __u64 red; + __u64 green; + __u64 blue; +}; + +struct drm_color_ctm { + /* + * Conversion matrix in S31.32 sign-magnitude + * (not two's complement!) format. + * + * out matrix in + * |R| |0 1 2| |R| + * |G| = |3 4 5| x |G| + * |B| |6 7 8| |B| + */ + __u64 matrix[9]; +}; + +struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and + * 0xffff == 1.0. + */ + __u16 red; + __u16 green; + __u16 blue; + __u16 reserved; +}; + +/** + * struct drm_plane_size_hint - Plane size hints + * + * The plane SIZE_HINTS property blob contains an + * array of struct drm_plane_size_hint. + */ +struct drm_plane_size_hint { + __u16 width; + __u16 height; +}; + +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ +struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ + __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. 
+ * @display_primaries.x: X coordinate of color primary. + * @display_primaries.y: Y coordinate of color primary. + */ + struct { + __u16 x, y; + } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X coordinate of whitepoint of color primary. + * @white_point.y: Y coordinate of whitepoint of color primary. + */ + struct { + __u16 x, y; + } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ + __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_fall; +}; + +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ +struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. 
+ */ + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + +/** + * DRM_MODE_PAGE_FLIP_EVENT + * + * Request that the kernel sends back a vblank event (see + * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the + * page-flip is done. + */ +#define DRM_MODE_PAGE_FLIP_EVENT 0x01 +/** + * DRM_MODE_PAGE_FLIP_ASYNC + * + * Request that the page-flip is performed as soon as possible, ie. with no + * delay due to waiting for vblank. This may cause tearing to be visible on + * the screen. + * + * When used with atomic uAPI, the driver will return an error if the hardware + * doesn't support performing an asynchronous page-flip for this update. + * User-space should handle this, e.g. by falling back to a regular page-flip. + * + * Note, some hardware might need to perform one last synchronous page-flip + * before being able to switch to asynchronous page-flips. As an exception, + * the driver will return success even though that first page-flip is not + * asynchronous. + */ +#define DRM_MODE_PAGE_FLIP_ASYNC 0x02 +#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 +#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8 +#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \ + DRM_MODE_PAGE_FLIP_TARGET_RELATIVE) +/** + * DRM_MODE_PAGE_FLIP_FLAGS + * + * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags. + */ +#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \ + DRM_MODE_PAGE_FLIP_ASYNC | \ + DRM_MODE_PAGE_FLIP_TARGET) + +/* + * Request a page flip on the specified crtc. + * + * This ioctl will ask KMS to schedule a page flip for the specified + * crtc. Once any pending rendering targeting the specified fb (as of + * ioctl time) has completed, the crtc will be reprogrammed to display + * that fb after the next vertical refresh. The ioctl returns + * immediately, but subsequent rendering to the current fb will block + * in the execbuffer ioctl until the page flip happens. 
If a page + * flip is already pending as the ioctl is called, EBUSY will be + * returned. + * + * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank + * event (see drm.h: struct drm_event_vblank) when the page flip is + * done. The user_data field passed in with this ioctl will be + * returned as the user_data field in the vblank event struct. + * + * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen + * 'as soon as possible', meaning that it not delay waiting for vblank. + * This may cause tearing on the screen. + * + * The reserved field must be zero. + */ + +struct drm_mode_crtc_page_flip { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 reserved; + __u64 user_data; +}; + +/* + * Request a page flip on the specified crtc. + * + * Same as struct drm_mode_crtc_page_flip, but supports new flags and + * re-purposes the reserved field: + * + * The sequence field must be zero unless either of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When + * the ABSOLUTE flag is specified, the sequence field denotes the absolute + * vblank sequence when the flip should take effect. When the RELATIVE + * flag is specified, the sequence field denotes the relative (to the + * current one when the ioctl is called) vblank sequence when the flip + * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to + * make sure the vblank sequence before the target one has passed before + * calling this ioctl. The purpose of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify + * the target for when code dealing with a page flip runs during a + * vertical blank period. + */ + +struct drm_mode_crtc_page_flip_target { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 sequence; + __u64 user_data; +}; + +/** + * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout. 
+ * @height: buffer height in pixels + * @width: buffer width in pixels + * @bpp: bits per pixel + * @flags: must be zero + * @handle: buffer object handle + * @pitch: number of bytes between two consecutive lines + * @size: size of the whole buffer in bytes + * + * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds, + * the kernel fills @handle, @pitch and @size. + */ +struct drm_mode_create_dumb { + __u32 height; + __u32 width; + __u32 bpp; + __u32 flags; + + __u32 handle; + __u32 pitch; + __u64 size; +}; + +/* set up for mmap of a dumb scanout buffer */ +struct drm_mode_map_dumb { + /** Handle for the object being mapped. */ + __u32 handle; + __u32 pad; + /** + * Fake offset to use for subsequent mmap call + * + * This is a fixed-size type for 32/64 compatibility. + */ + __u64 offset; +}; + +struct drm_mode_destroy_dumb { + __u32 handle; +}; + +/** + * DRM_MODE_ATOMIC_TEST_ONLY + * + * Do not apply the atomic commit, instead check whether the hardware supports + * this configuration. + * + * See &drm_mode_config_funcs.atomic_check for more details on test-only + * commits. + */ +#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100 +/** + * DRM_MODE_ATOMIC_NONBLOCK + * + * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC + * IOCTL returns immediately instead of waiting for the changes to be applied + * in hardware. Note, the driver will still check that the update can be + * applied before returning. + */ +#define DRM_MODE_ATOMIC_NONBLOCK 0x0200 +/** + * DRM_MODE_ATOMIC_ALLOW_MODESET + * + * Allow the update to result in temporary or transient visible artifacts while + * the update is being applied. Applying the update may also take significantly + * more time than a page flip. All visual artifacts will disappear by the time + * the update is completed, as signalled through the vblank event's timestamp + * (see struct drm_event_vblank). + * + * This flag must be set when the KMS update might cause visible artifacts. 
+ * Without this flag such KMS update will return a EINVAL error. What kind of + * update may cause visible artifacts depends on the driver and the hardware. + * User-space that needs to know beforehand if an update might cause visible + * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without + * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails. + * + * To the best of the driver's knowledge, visual artifacts are guaranteed to + * not appear when this flag is not set. Some sinks might display visual + * artifacts outside of the driver's control. + */ +#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400 + +/** + * DRM_MODE_ATOMIC_FLAGS + * + * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in + * &drm_mode_atomic.flags. + */ +#define DRM_MODE_ATOMIC_FLAGS (\ + DRM_MODE_PAGE_FLIP_EVENT |\ + DRM_MODE_PAGE_FLIP_ASYNC |\ + DRM_MODE_ATOMIC_TEST_ONLY |\ + DRM_MODE_ATOMIC_NONBLOCK |\ + DRM_MODE_ATOMIC_ALLOW_MODESET) + +struct drm_mode_atomic { + __u32 flags; + __u32 count_objs; + __u64 objs_ptr; + __u64 count_props_ptr; + __u64 props_ptr; + __u64 prop_values_ptr; + __u64 reserved; + __u64 user_data; +}; + +struct drm_format_modifier_blob { +#define FORMAT_BLOB_CURRENT 1 + /* Version of this blob format */ + __u32 version; + + /* Flags */ + __u32 flags; + + /* Number of fourcc formats supported */ + __u32 count_formats; + + /* Where in this blob the formats exist (in bytes) */ + __u32 formats_offset; + + /* Number of drm_format_modifiers */ + __u32 count_modifiers; + + /* Where in this blob the modifiers exist (in bytes) */ + __u32 modifiers_offset; + + /* __u32 formats[] */ + /* struct drm_format_modifier modifiers[] */ +}; + +struct drm_format_modifier { + /* Bitmask of formats in get_plane format list this info applies to. The + * offset allows a sliding window of which 64 formats (bits). 
+ * + * Some examples: + * In today's world with < 65 formats, and formats 0, and 2 are + * supported + * 0x0000000000000005 + * ^-offset = 0, formats = 5 + * + * If the number formats grew to 128, and formats 98-102 are + * supported with the modifier: + * + * 0x0000007c00000000 0000000000000000 + * ^ + * |__offset = 64, formats = 0x7c00000000 + * + */ + __u64 formats; + __u32 offset; + __u32 pad; + + /* The modifier that applies to the >get_plane format list bitmask. */ + __u64 modifier; +}; + +/** + * struct drm_mode_create_blob - Create New blob property + * + * Create a new 'blob' data property, copying length bytes from data pointer, + * and returning new blob ID. + */ +struct drm_mode_create_blob { + /** @data: Pointer to data to copy. */ + __u64 data; + /** @length: Length of data to copy. */ + __u32 length; + /** @blob_id: Return: new property ID. */ + __u32 blob_id; +}; + +/** + * struct drm_mode_destroy_blob - Destroy user blob + * @blob_id: blob_id to destroy + * + * Destroy a user-created blob property. + * + * User-space can release blobs as soon as they do not need to refer to them by + * their blob object ID. For instance, if you are using a MODE_ID blob in an + * atomic commit and you will not make another commit re-using the same ID, you + * can destroy the blob as soon as the commit has been issued, without waiting + * for it to complete. + */ +struct drm_mode_destroy_blob { + __u32 blob_id; +}; + +/** + * struct drm_mode_create_lease - Create lease + * + * Lease mode resources, creating another drm_master. + * + * The @object_ids array must reference at least one CRTC, one connector and + * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively, + * the lease can be completely empty. 
+ */ +struct drm_mode_create_lease { + /** @object_ids: Pointer to array of object ids (__u32) */ + __u64 object_ids; + /** @object_count: Number of object ids */ + __u32 object_count; + /** @flags: flags for new FD (O_CLOEXEC, etc) */ + __u32 flags; + + /** @lessee_id: Return: unique identifier for lessee. */ + __u32 lessee_id; + /** @fd: Return: file descriptor to new drm_master file */ + __u32 fd; +}; + +/** + * struct drm_mode_list_lessees - List lessees + * + * List lessees from a drm_master. + */ +struct drm_mode_list_lessees { + /** + * @count_lessees: Number of lessees. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_lessees; + /** @pad: Padding. */ + __u32 pad; + + /** + * @lessees_ptr: Pointer to lessees. + * + * Pointer to __u64 array of lessee ids + */ + __u64 lessees_ptr; +}; + +/** + * struct drm_mode_get_lease - Get Lease + * + * Get leased objects. + */ +struct drm_mode_get_lease { + /** + * @count_objects: Number of leased objects. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_objects; + /** @pad: Padding. */ + __u32 pad; + + /** + * @objects_ptr: Pointer to objects. + * + * Pointer to __u32 array of object ids. + */ + __u64 objects_ptr; +}; + +/** + * struct drm_mode_revoke_lease - Revoke lease + */ +struct drm_mode_revoke_lease { + /** @lessee_id: Unique ID of lessee */ + __u32 lessee_id; +}; + +/** + * struct drm_mode_rect - Two dimensional rectangle. + * @x1: Horizontal starting coordinate (inclusive). + * @y1: Vertical starting coordinate (inclusive). + * @x2: Horizontal ending coordinate (exclusive). + * @y2: Vertical ending coordinate (exclusive). 
+ * + * With drm subsystem using struct drm_rect to manage rectangular area this + * export it to user-space. + * + * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS. + */ +struct drm_mode_rect { + __s32 x1; + __s32 y1; + __s32 x2; + __s32 y2; +}; + +/** + * struct drm_mode_closefb + * @fb_id: Framebuffer ID. + * @pad: Must be zero. + */ +struct drm_mode_closefb { + __u32 fb_id; + __u32 pad; +}; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/xf86drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/xf86drm.h new file mode 100644 index 0000000000..1bc6e2233e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/drm/xf86drm.h @@ -0,0 +1,983 @@ +/** + * \file xf86drm.h + * OS-independent header for DRM user-level library interface. + * + * \author Rickard E. (Rik) Faith + */ + +/* + * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _XF86DRM_H_ +#define _XF86DRM_H_ + +#include <stdarg.h> +#include <sys/types.h> +#include <stdint.h> +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef DRM_MAX_MINOR +#define DRM_MAX_MINOR 64 /* deprecated */ +#endif + +#if defined(__linux__) + +#define DRM_IOCTL_NR(n) _IOC_NR(n) +#define DRM_IOC_VOID _IOC_NONE +#define DRM_IOC_READ _IOC_READ +#define DRM_IOC_WRITE _IOC_WRITE +#define DRM_IOC_READWRITE _IOC_READ|_IOC_WRITE +#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size) + +#else /* One of the *BSDs */ + +#include <sys/ioccom.h> +#define DRM_IOCTL_NR(n) ((n) & 0xff) +#define DRM_IOC_VOID IOC_VOID +#define DRM_IOC_READ IOC_OUT +#define DRM_IOC_WRITE IOC_IN +#define DRM_IOC_READWRITE IOC_INOUT +#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size) + +#endif + + /* Defaults, if nothing set in xf86config */ +#define DRM_DEV_UID 0 +#define DRM_DEV_GID 0 +/* Default /dev/dri directory permissions 0755 */ +#define DRM_DEV_DIRMODE \ + (S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) +#define DRM_DEV_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) + +#ifdef __OpenBSD__ +#define DRM_DIR_NAME "/dev" +#define DRM_PRIMARY_MINOR_NAME "drm" +#define DRM_CONTROL_MINOR_NAME "drmC" /* deprecated */ +#define DRM_RENDER_MINOR_NAME "drmR" +#else +#define DRM_DIR_NAME "/dev/dri" +#define DRM_PRIMARY_MINOR_NAME "card" +#define DRM_CONTROL_MINOR_NAME "controlD" /* deprecated */ +#define DRM_RENDER_MINOR_NAME "renderD" +#define DRM_PROC_NAME "/proc/dri/" /* For backward Linux compatibility */ +#endif + +#define DRM_DEV_NAME "%s/" DRM_PRIMARY_MINOR_NAME "%d" +#define DRM_CONTROL_DEV_NAME "%s/" DRM_CONTROL_MINOR_NAME "%d" /* deprecated */ +#define DRM_RENDER_DEV_NAME "%s/" 
DRM_RENDER_MINOR_NAME "%d" + +#define DRM_NODE_NAME_MAX \ + (sizeof(DRM_DIR_NAME) + 1 /* slash */ \ + + MAX3(sizeof(DRM_PRIMARY_MINOR_NAME), \ + sizeof(DRM_CONTROL_MINOR_NAME), \ + sizeof(DRM_RENDER_MINOR_NAME)) \ + + sizeof("1048575") /* highest possible node number 2^MINORBITS - 1 */ \ + + 1) /* NULL-terminator */ + +#define DRM_ERR_NO_DEVICE (-1001) +#define DRM_ERR_NO_ACCESS (-1002) +#define DRM_ERR_NOT_ROOT (-1003) +#define DRM_ERR_INVALID (-1004) +#define DRM_ERR_NO_FD (-1005) + +#define DRM_AGP_NO_HANDLE 0 + +typedef unsigned int drmSize, *drmSizePtr; /**< For mapped regions */ +typedef void *drmAddress, **drmAddressPtr; /**< For mapped regions */ + +#if (__GNUC__ >= 3) +#define DRM_PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a))) +#else +#define DRM_PRINTFLIKE(f, a) +#endif + +typedef struct _drmServerInfo { + int (*debug_print)(const char *format, va_list ap) DRM_PRINTFLIKE(1,0); + int (*load_module)(const char *name); + void (*get_perms)(gid_t *, mode_t *); +} drmServerInfo, *drmServerInfoPtr; + +typedef struct drmHashEntry { + int fd; + void (*f)(int, void *, void *); + void *tagTable; +} drmHashEntry; + +extern int drmIoctl(int fd, unsigned long request, void *arg); +extern void *drmGetHashTable(void); +extern drmHashEntry *drmGetEntry(int fd); + +/** + * Driver version information. + * + * \sa drmGetVersion() and drmSetVersion(). 
+ */ +typedef struct _drmVersion { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + int name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + int date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + int desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +} drmVersion, *drmVersionPtr; + +typedef struct _drmStats { + unsigned long count; /**< Number of data */ + struct { + unsigned long value; /**< Value from kernel */ + const char *long_format; /**< Suggested format for long_name */ + const char *long_name; /**< Long name for value */ + const char *rate_format; /**< Suggested format for rate_name */ + const char *rate_name; /**< Short name for value per second */ + int isvalue; /**< True if value (vs. counter) */ + const char *mult_names; /**< Multiplier names (e.g., "KGM") */ + int mult; /**< Multiplier value (e.g., 1024) */ + int verbose; /**< Suggest only in verbose output */ + } data[15]; +} drmStatsT; + + + /* All of these enums *MUST* match with the + kernel implementation -- so do *NOT* + change them! (The drmlib implementation + will just copy the flags instead of + translating them.) 
*/ +typedef enum { + DRM_FRAME_BUFFER = 0, /**< WC, no caching, no core dump */ + DRM_REGISTERS = 1, /**< no caching, no core dump */ + DRM_SHM = 2, /**< shared, cached */ + DRM_AGP = 3, /**< AGP/GART */ + DRM_SCATTER_GATHER = 4, /**< PCI scatter/gather */ + DRM_CONSISTENT = 5 /**< PCI consistent */ +} drmMapType; + +typedef enum { + DRM_RESTRICTED = 0x0001, /**< Cannot be mapped to client-virtual */ + DRM_READ_ONLY = 0x0002, /**< Read-only in client-virtual */ + DRM_LOCKED = 0x0004, /**< Physical pages locked */ + DRM_KERNEL = 0x0008, /**< Kernel requires access */ + DRM_WRITE_COMBINING = 0x0010, /**< Use write-combining, if available */ + DRM_CONTAINS_LOCK = 0x0020, /**< SHM page that contains lock */ + DRM_REMOVABLE = 0x0040 /**< Removable mapping */ +} drmMapFlags; + +/** + * \warning These values *MUST* match drm.h + */ +typedef enum { + /** \name Flags for DMA buffer dispatch */ + /*@{*/ + DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note the buffer may not yet have been + * processed by the hardware -- getting a + * hardware lock with the hardware quiescent + * will ensure that the buffer has been + * processed. 
+ */ + DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + /*@}*/ + + /** \name Flags for DMA buffer request */ + /*@{*/ + DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ + /*@}*/ +} drmDMAFlags; + +typedef enum { + DRM_PAGE_ALIGN = 0x01, + DRM_AGP_BUFFER = 0x02, + DRM_SG_BUFFER = 0x04, + DRM_FB_BUFFER = 0x08, + DRM_PCI_BUFFER_RO = 0x10 +} drmBufDescFlags; + +typedef enum { + DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +} drmLockFlags; + +typedef enum { + DRM_CONTEXT_PRESERVED = 0x01, /**< This context is preserved and + never swapped. */ + DRM_CONTEXT_2DONLY = 0x02 /**< This context is for 2D rendering only. 
*/ +} drm_context_tFlags, *drm_context_tFlagsPtr; + +typedef struct _drmBufDesc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ +} drmBufDesc, *drmBufDescPtr; + +typedef struct _drmBufInfo { + int count; /**< Number of buffers described in list */ + drmBufDescPtr list; /**< List of buffer descriptions */ +} drmBufInfo, *drmBufInfoPtr; + +typedef struct _drmBuf { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + drmAddress address; /**< Address */ +} drmBuf, *drmBufPtr; + +/** + * Buffer mapping information. + * + * Used by drmMapBufs() and drmUnmapBufs() to store information about the + * mapped buffers. + */ +typedef struct _drmBufMap { + int count; /**< Number of buffers mapped */ + drmBufPtr list; /**< Buffers */ +} drmBufMap, *drmBufMapPtr; + +typedef struct _drmLock { + volatile unsigned int lock; + char padding[60]; + /* This is big enough for most current (and future?) architectures: + DEC Alpha: 32 bytes + Intel Merced: ? + Intel P5/PPro/PII/PIII: 32 bytes + Intel StrongARM: 32 bytes + Intel i386/i486: 16 bytes + MIPS: 32 bytes (?) 
+ Motorola 68k: 16 bytes + Motorola PowerPC: 32 bytes + Sun SPARC: 32 bytes + */ +} drmLock, *drmLockPtr; + +/** + * Indices here refer to the offset into + * list in drmBufInfo + */ +typedef struct _drmDMAReq { + drm_context_t context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_list; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send, in bytes */ + drmDMAFlags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size of buffers requested */ + int *request_list; /**< Buffer information */ + int *request_sizes; /**< Minimum acceptable sizes */ + int granted_count; /**< Number of buffers granted at this size */ +} drmDMAReq, *drmDMAReqPtr; + +typedef struct _drmRegion { + drm_handle_t handle; + unsigned int offset; + drmSize size; + drmAddress map; +} drmRegion, *drmRegionPtr; + +typedef struct _drmTextureRegion { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; /**< Explicitly pad this out */ + unsigned int age; +} drmTextureRegion, *drmTextureRegionPtr; + + +typedef enum { + DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + DRM_VBLANK_SIGNAL = 0x40000000 /* Send signal instead of blocking */ +} drmVBlankSeqType; +#define DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +typedef struct _drmVBlankReq { + drmVBlankSeqType type; + unsigned int sequence; + unsigned long signal; +} drmVBlankReq, *drmVBlankReqPtr; + +typedef struct 
_drmVBlankReply { + drmVBlankSeqType type; + unsigned int sequence; + long tval_sec; + long tval_usec; +} drmVBlankReply, *drmVBlankReplyPtr; + +typedef union _drmVBlank { + drmVBlankReq request; + drmVBlankReply reply; +} drmVBlank, *drmVBlankPtr; + +typedef struct _drmSetVersion { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +} drmSetVersion, *drmSetVersionPtr; + +#define __drm_dummy_lock(lock) (*(__volatile__ unsigned int *)lock) + +#define DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ + +#if defined(__GNUC__) && (__GNUC__ >= 2) +# if defined(__i386) || defined(__AMD64__) || defined(__x86_64__) || defined(__amd64__) + /* Reflect changes here to drmP.h */ +#define DRM_CAS(lock,old,new,__ret) \ + do { \ + int __dummy; /* Can't mark eax as clobbered */ \ + __asm__ __volatile__( \ + "lock ; cmpxchg %4,%1\n\t" \ + "setnz %0" \ + : "=d" (__ret), \ + "=m" (__drm_dummy_lock(lock)), \ + "=a" (__dummy) \ + : "2" (old), \ + "r" (new)); \ + } while (0) + +#elif defined(__alpha__) + +#define DRM_CAS(lock, old, new, ret) \ + do { \ + int tmp, old32; \ + __asm__ __volatile__( \ + " addl $31, %5, %3\n" \ + "1: ldl_l %0, %2\n" \ + " cmpeq %0, %3, %1\n" \ + " beq %1, 2f\n" \ + " mov %4, %0\n" \ + " stl_c %0, %2\n" \ + " beq %0, 3f\n" \ + " mb\n" \ + "2: cmpeq %1, 0, %1\n" \ + ".subsection 2\n" \ + "3: br 1b\n" \ + ".previous" \ + : "=&r"(tmp), "=&r"(ret), \ + "=m"(__drm_dummy_lock(lock)), \ + "=&r"(old32) \ + : "r"(new), "r"(old) \ + : "memory"); \ + } while (0) + +#elif defined(__sparc__) + +#define DRM_CAS(lock,old,new,__ret) \ +do { register unsigned int __old __asm("o0"); \ + register unsigned int __new __asm("o1"); \ + register volatile unsigned int *__lock __asm("o2"); \ + __old = old; \ + __new = new; \ + __lock = (volatile unsigned int *)lock; \ + __asm__ __volatile__( \ + /*"cas [%2], %3, %0"*/ \ + ".word 0xd3e29008\n\t" \ + /*"membar #StoreStore | 
#StoreLoad"*/ \ + ".word 0x8143e00a" \ + : "=&r" (__new) \ + : "0" (__new), \ + "r" (__lock), \ + "r" (__old) \ + : "memory"); \ + __ret = (__new != __old); \ +} while(0) + +#elif defined(__ia64__) + +#ifdef __INTEL_COMPILER +/* this currently generates bad code (missing stop bits)... */ +#include + +#define DRM_CAS(lock,old,new,__ret) \ + do { \ + unsigned long __result, __old = (old) & 0xffffffff; \ + __mf(); \ + __result = _InterlockedCompareExchange_acq(&__drm_dummy_lock(lock), (new), __old);\ + __ret = (__result) != (__old); \ +/* __ret = (__sync_val_compare_and_swap(&__drm_dummy_lock(lock), \ + (old), (new)) \ + != (old)); */\ + } while (0) + +#else +#define DRM_CAS(lock,old,new,__ret) \ + do { \ + unsigned int __result, __old = (old); \ + __asm__ __volatile__( \ + "mf\n" \ + "mov ar.ccv=%2\n" \ + ";;\n" \ + "cmpxchg4.acq %0=%1,%3,ar.ccv" \ + : "=r" (__result), "=m" (__drm_dummy_lock(lock)) \ + : "r" ((unsigned long)__old), "r" (new) \ + : "memory"); \ + __ret = (__result) != (__old); \ + } while (0) + +#endif + +#elif defined(__powerpc__) + +#define DRM_CAS(lock,old,new,__ret) \ + do { \ + __asm__ __volatile__( \ + "sync;" \ + "0: lwarx %0,0,%1;" \ + " xor. %0,%3,%0;" \ + " bne 1f;" \ + " stwcx. 
%2,0,%1;" \ + " bne- 0b;" \ + "1: " \ + "sync;" \ + : "=&r"(__ret) \ + : "r"(lock), "r"(new), "r"(old) \ + : "cr0", "memory"); \ + } while (0) + +# elif defined (__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined (__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined (__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined (__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__) + /* excluding ARMv4/ARMv5 and lower (lacking ldrex/strex support) */ + #undef DRM_DEV_MODE + #define DRM_DEV_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH) + + #define DRM_CAS(lock,old,new,__ret) \ + do { \ + __asm__ __volatile__ ( \ + "1: ldrex %0, [%1]\n" \ + " teq %0, %2\n" \ + " ite eq\n" \ + " strexeq %0, %3, [%1]\n" \ + " movne %0, #1\n" \ + : "=&r" (__ret) \ + : "r" (lock), "r" (old), "r" (new) \ + : "cc","memory"); \ + } while (0) + +#endif /* architecture */ +#endif /* __GNUC__ >= 2 */ + +#ifndef DRM_CAS +#define DRM_CAS(lock,old,new,ret) do { ret=1; } while (0) /* FAST LOCK FAILS */ +#endif + +#if defined(__alpha__) +#define DRM_CAS_RESULT(_result) long _result +#elif defined(__powerpc__) +#define DRM_CAS_RESULT(_result) int _result +#else +#define DRM_CAS_RESULT(_result) char _result +#endif + +#define DRM_LIGHT_LOCK(fd,lock,context) \ + do { \ + DRM_CAS_RESULT(__ret); \ + DRM_CAS(lock,context,DRM_LOCK_HELD|context,__ret); \ + if (__ret) drmGetLock(fd,context,0); \ + } while(0) + + /* This one counts fast locks -- for + benchmarking only. 
*/ +#define DRM_LIGHT_LOCK_COUNT(fd,lock,context,count) \ + do { \ + DRM_CAS_RESULT(__ret); \ + DRM_CAS(lock,context,DRM_LOCK_HELD|context,__ret); \ + if (__ret) drmGetLock(fd,context,0); \ + else ++count; \ + } while(0) + +#define DRM_LOCK(fd,lock,context,flags) \ + do { \ + if (flags) drmGetLock(fd,context,flags); \ + else DRM_LIGHT_LOCK(fd,lock,context); \ + } while(0) + +#define DRM_UNLOCK(fd,lock,context) \ + do { \ + DRM_CAS_RESULT(__ret); \ + DRM_CAS(lock,DRM_LOCK_HELD|context,context,__ret); \ + if (__ret) drmUnlock(fd,context); \ + } while(0) + + /* Simple spin locks */ +#define DRM_SPINLOCK(spin,val) \ + do { \ + DRM_CAS_RESULT(__ret); \ + do { \ + DRM_CAS(spin,0,val,__ret); \ + if (__ret) while ((spin)->lock); \ + } while (__ret); \ + } while(0) + +#define DRM_SPINLOCK_TAKE(spin,val) \ + do { \ + DRM_CAS_RESULT(__ret); \ + int cur; \ + do { \ + cur = (*spin).lock; \ + DRM_CAS(spin,cur,val,__ret); \ + } while (__ret); \ + } while(0) + +#define DRM_SPINLOCK_COUNT(spin,val,count,__ret) \ + do { \ + int __i; \ + __ret = 1; \ + for (__i = 0; __ret && __i < count; __i++) { \ + DRM_CAS(spin,0,val,__ret); \ + if (__ret) for (;__i < count && (spin)->lock; __i++); \ + } \ + } while(0) + +#define DRM_SPINUNLOCK(spin,val) \ + do { \ + DRM_CAS_RESULT(__ret); \ + if ((*spin).lock == val) { /* else server stole lock */ \ + do { \ + DRM_CAS(spin,val,0,__ret); \ + } while (__ret); \ + } \ + } while(0) + + + +/* General user-level programmer's API: unprivileged */ +extern int drmAvailable(void); +extern int drmOpen(const char *name, const char *busid); + +#define DRM_NODE_PRIMARY 0 +#define DRM_NODE_CONTROL 1 /* deprecated: never returned */ +#define DRM_NODE_RENDER 2 +#define DRM_NODE_MAX 3 + +extern int drmOpenWithType(const char *name, const char *busid, + int type); + +extern int drmOpenControl(int minor); /* deprecated: always fails */ +extern int drmOpenRender(int minor); +extern int drmClose(int fd); +extern drmVersionPtr drmGetVersion(int fd); +extern 
drmVersionPtr drmGetLibVersion(int fd); +extern int drmGetCap(int fd, uint64_t capability, uint64_t *value); +extern void drmFreeVersion(drmVersionPtr); +extern int drmGetMagic(int fd, drm_magic_t * magic); +extern char *drmGetBusid(int fd); +extern int drmGetInterruptFromBusID(int fd, int busnum, int devnum, + int funcnum); +extern int drmGetMap(int fd, int idx, drm_handle_t *offset, + drmSize *size, drmMapType *type, + drmMapFlags *flags, drm_handle_t *handle, + int *mtrr); +extern int drmGetClient(int fd, int idx, int *auth, int *pid, + int *uid, unsigned long *magic, + unsigned long *iocs); +extern int drmGetStats(int fd, drmStatsT *stats); +extern int drmSetInterfaceVersion(int fd, drmSetVersion *version); +extern int drmCommandNone(int fd, unsigned long drmCommandIndex); +extern int drmCommandRead(int fd, unsigned long drmCommandIndex, + void *data, unsigned long size); +extern int drmCommandWrite(int fd, unsigned long drmCommandIndex, + void *data, unsigned long size); +extern int drmCommandWriteRead(int fd, unsigned long drmCommandIndex, + void *data, unsigned long size); + +/* General user-level programmer's API: X server (root) only */ +extern void drmFreeBusid(const char *busid); +extern int drmSetBusid(int fd, const char *busid); +extern int drmAuthMagic(int fd, drm_magic_t magic); +extern int drmAddMap(int fd, + drm_handle_t offset, + drmSize size, + drmMapType type, + drmMapFlags flags, + drm_handle_t * handle); +extern int drmRmMap(int fd, drm_handle_t handle); +extern int drmAddContextPrivateMapping(int fd, drm_context_t ctx_id, + drm_handle_t handle); + +extern int drmAddBufs(int fd, int count, int size, + drmBufDescFlags flags, + int agp_offset); +extern int drmMarkBufs(int fd, double low, double high); +extern int drmCreateContext(int fd, drm_context_t * handle); +extern int drmSetContextFlags(int fd, drm_context_t context, + drm_context_tFlags flags); +extern int drmGetContextFlags(int fd, drm_context_t context, + drm_context_tFlagsPtr flags); 
+extern int drmAddContextTag(int fd, drm_context_t context, void *tag); +extern int drmDelContextTag(int fd, drm_context_t context); +extern void *drmGetContextTag(int fd, drm_context_t context); +extern drm_context_t * drmGetReservedContextList(int fd, int *count); +extern void drmFreeReservedContextList(drm_context_t *); +extern int drmSwitchToContext(int fd, drm_context_t context); +extern int drmDestroyContext(int fd, drm_context_t handle); +extern int drmCreateDrawable(int fd, drm_drawable_t * handle); +extern int drmDestroyDrawable(int fd, drm_drawable_t handle); +extern int drmUpdateDrawableInfo(int fd, drm_drawable_t handle, + drm_drawable_info_type_t type, + unsigned int num, void *data); +extern int drmCtlInstHandler(int fd, int irq); +extern int drmCtlUninstHandler(int fd); +extern int drmSetClientCap(int fd, uint64_t capability, + uint64_t value); + +extern int drmCrtcGetSequence(int fd, uint32_t crtcId, + uint64_t *sequence, uint64_t *ns); +extern int drmCrtcQueueSequence(int fd, uint32_t crtcId, + uint32_t flags, uint64_t sequence, + uint64_t *sequence_queued, + uint64_t user_data); +/* General user-level programmer's API: authenticated client and/or X */ +extern int drmMap(int fd, + drm_handle_t handle, + drmSize size, + drmAddressPtr address); +extern int drmUnmap(drmAddress address, drmSize size); +extern drmBufInfoPtr drmGetBufInfo(int fd); +extern drmBufMapPtr drmMapBufs(int fd); +extern int drmUnmapBufs(drmBufMapPtr bufs); +extern int drmDMA(int fd, drmDMAReqPtr request); +extern int drmFreeBufs(int fd, int count, int *list); +extern int drmGetLock(int fd, + drm_context_t context, + drmLockFlags flags); +extern int drmUnlock(int fd, drm_context_t context); +extern int drmFinish(int fd, int context, drmLockFlags flags); +extern int drmGetContextPrivateMapping(int fd, drm_context_t ctx_id, + drm_handle_t * handle); + +/* AGP/GART support: X server (root) only */ +extern int drmAgpAcquire(int fd); +extern int drmAgpRelease(int fd); +extern int 
drmAgpEnable(int fd, unsigned long mode); +extern int drmAgpAlloc(int fd, unsigned long size, + unsigned long type, unsigned long *address, + drm_handle_t *handle); +extern int drmAgpFree(int fd, drm_handle_t handle); +extern int drmAgpBind(int fd, drm_handle_t handle, + unsigned long offset); +extern int drmAgpUnbind(int fd, drm_handle_t handle); + +/* AGP/GART info: authenticated client and/or X */ +extern int drmAgpVersionMajor(int fd); +extern int drmAgpVersionMinor(int fd); +extern unsigned long drmAgpGetMode(int fd); +extern unsigned long drmAgpBase(int fd); /* Physical location */ +extern unsigned long drmAgpSize(int fd); /* Bytes */ +extern unsigned long drmAgpMemoryUsed(int fd); +extern unsigned long drmAgpMemoryAvail(int fd); +extern unsigned int drmAgpVendorId(int fd); +extern unsigned int drmAgpDeviceId(int fd); + +/* PCI scatter/gather support: X server (root) only */ +extern int drmScatterGatherAlloc(int fd, unsigned long size, + drm_handle_t *handle); +extern int drmScatterGatherFree(int fd, drm_handle_t handle); + +extern int drmWaitVBlank(int fd, drmVBlankPtr vbl); + +/* Support routines */ +extern void drmSetServerInfo(drmServerInfoPtr info); +extern int drmError(int err, const char *label); +extern void *drmMalloc(int size); +extern void drmFree(void *pt); + +/* Hash table routines */ +extern void *drmHashCreate(void); +extern int drmHashDestroy(void *t); +extern int drmHashLookup(void *t, unsigned long key, void **value); +extern int drmHashInsert(void *t, unsigned long key, void *value); +extern int drmHashDelete(void *t, unsigned long key); +extern int drmHashFirst(void *t, unsigned long *key, void **value); +extern int drmHashNext(void *t, unsigned long *key, void **value); + +/* PRNG routines */ +extern void *drmRandomCreate(unsigned long seed); +extern int drmRandomDestroy(void *state); +extern unsigned long drmRandom(void *state); +extern double drmRandomDouble(void *state); + +/* Skip list routines */ + +extern void *drmSLCreate(void); 
+extern int drmSLDestroy(void *l); +extern int drmSLLookup(void *l, unsigned long key, void **value); +extern int drmSLInsert(void *l, unsigned long key, void *value); +extern int drmSLDelete(void *l, unsigned long key); +extern int drmSLNext(void *l, unsigned long *key, void **value); +extern int drmSLFirst(void *l, unsigned long *key, void **value); +extern void drmSLDump(void *l); +extern int drmSLLookupNeighbors(void *l, unsigned long key, + unsigned long *prev_key, void **prev_value, + unsigned long *next_key, void **next_value); + +extern int drmOpenOnce(void *unused, const char *BusID, int *newlyopened); +extern int drmOpenOnceWithType(const char *BusID, int *newlyopened, int type); +extern void drmCloseOnce(int fd); +extern void drmMsg(const char *format, ...) DRM_PRINTFLIKE(1, 2); + +extern int drmSetMaster(int fd); +extern int drmDropMaster(int fd); +extern int drmIsMaster(int fd); + +#define DRM_EVENT_CONTEXT_VERSION 4 + +typedef struct _drmEventContext { + + /* This struct is versioned so we can add more pointers if we + * add more events. */ + int version; + + void (*vblank_handler)(int fd, + unsigned int sequence, + unsigned int tv_sec, + unsigned int tv_usec, + void *user_data); + + void (*page_flip_handler)(int fd, + unsigned int sequence, + unsigned int tv_sec, + unsigned int tv_usec, + void *user_data); + + void (*page_flip_handler2)(int fd, + unsigned int sequence, + unsigned int tv_sec, + unsigned int tv_usec, + unsigned int crtc_id, + void *user_data); + + void (*sequence_handler)(int fd, + uint64_t sequence, + uint64_t ns, + uint64_t user_data); +} drmEventContext, *drmEventContextPtr; + +extern int drmHandleEvent(int fd, drmEventContextPtr evctx); + +extern char *drmGetDeviceNameFromFd(int fd); + +/* Improved version of drmGetDeviceNameFromFd which attributes for any type of + * device/node - card or renderD. 
+ */ +extern char *drmGetDeviceNameFromFd2(int fd); +extern int drmGetNodeTypeFromFd(int fd); + +/* Convert between GEM handles and DMA-BUF file descriptors. + * + * Warning: since GEM handles are not reference-counted and are unique per + * DRM file description, the caller is expected to perform its own reference + * counting. drmPrimeFDToHandle is guaranteed to return the same handle for + * different FDs if they reference the same underlying buffer object. This + * could even be a buffer object originally created on the same DRM FD. + * + * When sharing a DRM FD with an API such as EGL or GBM, the caller must not + * use drmPrimeHandleToFD nor drmPrimeFDToHandle. A single user-space + * reference-counting implementation is necessary to avoid double-closing GEM + * handles. + * + * Two processes can't share the same DRM FD and both use it to create or + * import GEM handles, even when using a single user-space reference-counting + * implementation like GBM, because GBM doesn't share its state between + * processes. 
+ */ +extern int drmPrimeHandleToFD(int fd, uint32_t handle, uint32_t flags, int *prime_fd); +extern int drmPrimeFDToHandle(int fd, int prime_fd, uint32_t *handle); + +extern int drmCloseBufferHandle(int fd, uint32_t handle); + +extern char *drmGetPrimaryDeviceNameFromFd(int fd); +extern char *drmGetRenderDeviceNameFromFd(int fd); + +#define DRM_BUS_PCI 0 +#define DRM_BUS_USB 1 +#define DRM_BUS_PLATFORM 2 +#define DRM_BUS_HOST1X 3 + +typedef struct _drmPciBusInfo { + uint16_t domain; + uint8_t bus; + uint8_t dev; + uint8_t func; +} drmPciBusInfo, *drmPciBusInfoPtr; + +typedef struct _drmPciDeviceInfo { + uint16_t vendor_id; + uint16_t device_id; + uint16_t subvendor_id; + uint16_t subdevice_id; + uint8_t revision_id; +} drmPciDeviceInfo, *drmPciDeviceInfoPtr; + +typedef struct _drmUsbBusInfo { + uint8_t bus; + uint8_t dev; +} drmUsbBusInfo, *drmUsbBusInfoPtr; + +typedef struct _drmUsbDeviceInfo { + uint16_t vendor; + uint16_t product; +} drmUsbDeviceInfo, *drmUsbDeviceInfoPtr; + +#define DRM_PLATFORM_DEVICE_NAME_LEN 512 + +typedef struct _drmPlatformBusInfo { + char fullname[DRM_PLATFORM_DEVICE_NAME_LEN]; +} drmPlatformBusInfo, *drmPlatformBusInfoPtr; + +typedef struct _drmPlatformDeviceInfo { + char **compatible; /* NULL terminated list of compatible strings */ +} drmPlatformDeviceInfo, *drmPlatformDeviceInfoPtr; + +#define DRM_HOST1X_DEVICE_NAME_LEN 512 + +typedef struct _drmHost1xBusInfo { + char fullname[DRM_HOST1X_DEVICE_NAME_LEN]; +} drmHost1xBusInfo, *drmHost1xBusInfoPtr; + +typedef struct _drmHost1xDeviceInfo { + char **compatible; /* NULL terminated list of compatible strings */ +} drmHost1xDeviceInfo, *drmHost1xDeviceInfoPtr; + +typedef struct _drmDevice { + char **nodes; /* DRM_NODE_MAX sized array */ + int available_nodes; /* DRM_NODE_* bitmask */ + int bustype; + union { + drmPciBusInfoPtr pci; + drmUsbBusInfoPtr usb; + drmPlatformBusInfoPtr platform; + drmHost1xBusInfoPtr host1x; + } businfo; + union { + drmPciDeviceInfoPtr pci; + drmUsbDeviceInfoPtr 
usb; + drmPlatformDeviceInfoPtr platform; + drmHost1xDeviceInfoPtr host1x; + } deviceinfo; +} drmDevice, *drmDevicePtr; + +extern int drmGetDevice(int fd, drmDevicePtr *device); +extern void drmFreeDevice(drmDevicePtr *device); + +extern int drmGetDevices(drmDevicePtr devices[], int max_devices); +extern void drmFreeDevices(drmDevicePtr devices[], int count); + +#define DRM_DEVICE_GET_PCI_REVISION (1 << 0) +extern int drmGetDevice2(int fd, uint32_t flags, drmDevicePtr *device); +extern int drmGetDevices2(uint32_t flags, drmDevicePtr devices[], int max_devices); + +extern int drmGetDeviceFromDevId(dev_t dev_id, uint32_t flags, drmDevicePtr *device); + +/** + * Get the node type (DRM_NODE_PRIMARY or DRM_NODE_RENDER) from a device ID. + * + * Returns negative errno on error. + */ +extern int drmGetNodeTypeFromDevId(dev_t devid); + +/** + * Check if two drmDevice pointers represent the same DRM device. + * + * Returns 1 if the devices are equal, 0 otherwise. + */ +extern int drmDevicesEqual(drmDevicePtr a, drmDevicePtr b); + +extern int drmSyncobjCreate(int fd, uint32_t flags, uint32_t *handle); +extern int drmSyncobjDestroy(int fd, uint32_t handle); +extern int drmSyncobjHandleToFD(int fd, uint32_t handle, int *obj_fd); +extern int drmSyncobjFDToHandle(int fd, int obj_fd, uint32_t *handle); + +extern int drmSyncobjImportSyncFile(int fd, uint32_t handle, int sync_file_fd); +extern int drmSyncobjExportSyncFile(int fd, uint32_t handle, int *sync_file_fd); +extern int drmSyncobjWait(int fd, uint32_t *handles, unsigned num_handles, + int64_t timeout_nsec, unsigned flags, + uint32_t *first_signaled); +extern int drmSyncobjReset(int fd, const uint32_t *handles, uint32_t handle_count); +extern int drmSyncobjSignal(int fd, const uint32_t *handles, uint32_t handle_count); +extern int drmSyncobjTimelineSignal(int fd, const uint32_t *handles, + uint64_t *points, uint32_t handle_count); +extern int drmSyncobjTimelineWait(int fd, uint32_t *handles, uint64_t *points, + unsigned 
num_handles, + int64_t timeout_nsec, unsigned flags, + uint32_t *first_signaled); +extern int drmSyncobjQuery(int fd, uint32_t *handles, uint64_t *points, + uint32_t handle_count); +extern int drmSyncobjQuery2(int fd, uint32_t *handles, uint64_t *points, + uint32_t handle_count, uint32_t flags); +extern int drmSyncobjTransfer(int fd, + uint32_t dst_handle, uint64_t dst_point, + uint32_t src_handle, uint64_t src_point, + uint32_t flags); +extern int drmSyncobjEventfd(int fd, uint32_t handle, uint64_t point, int ev_fd, + uint32_t flags); + +extern char * +drmGetFormatModifierVendor(uint64_t modifier); + +extern char * +drmGetFormatModifierName(uint64_t modifier); + +extern char * +drmGetFormatName(uint32_t format); + +#ifndef fourcc_mod_get_vendor +#define fourcc_mod_get_vendor(modifier) \ + (((modifier) >> 56) & 0xff) +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h index 88ab70ae93..e19441b5a8 100644 --- a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt.h @@ -398,6 +398,12 @@ hsaKmtGetQueueInfo( HsaQueueInfo *QueueInfo //IN ); +HSAKMT_STATUS +HSAKMTAPI +hsaKmtQueueRingDoorbell( + HSA_QUEUEID QueueId +); + /** Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch. */ diff --git a/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt_drm.h b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt_drm.h new file mode 100644 index 0000000000..af9658cf6d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/hsakmt/hsakmt_drm.h @@ -0,0 +1,33 @@ +/* + * Copyright © 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HSAKMT_DRM_H_ +#define _HSAKMT_DRM_H_ + +#include "drm/xf86drm.h" +#include "drm/amdgpu.h" +#include "drm/amdgpu_drm.h" + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h new file mode 100644 index 0000000000..4f34bd1d50 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/Brig.h @@ -0,0 +1,1131 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013-2015, Advanced Micro Devices, Inc. +// All rights reserved. 
+// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. 
+ +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uintXX_t */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/*========================================================================================*/ +/* =======================================================================================*/ +/* =======================================================================================*/ +/* =======================================================================================*/ + +typedef uint32_t BrigCodeOffset32_t; +typedef uint32_t BrigOperandOffset32_t; +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; +typedef BrigDataOffset32_t BrigDataOffsetString32_t; + +typedef uint32_t BrigVersion32_t; +enum BrigVersion { + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint16_t BrigKind16_t; +enum BrigKind { + BRIG_KIND_NONE = 0x0000, + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, + + BRIG_KIND_INST_BEGIN = 0x2000, + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, +
BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + BRIG_KIND_INST_END = 0x2012, + + BRIG_KIND_OPERAND_BEGIN = 0x3000, + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d +}; + +typedef uint8_t BrigAlignment8_t; +enum BrigAlignment { + BRIG_ALIGNMENT_NONE = 0, + BRIG_ALIGNMENT_1 = 1, + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_256 +}; + +typedef uint8_t BrigAllocation8_t; +enum BrigAllocation { + BRIG_ALLOCATION_NONE = 0, + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +typedef uint8_t BrigAluModifier8_t; +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +typedef uint8_t BrigAtomicOperation8_t; +enum BrigAtomicOperation { + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + 
BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +typedef uint8_t BrigCompareOperation8_t; +enum BrigCompareOperation { + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ = 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +typedef uint16_t BrigControlDirective16_t; +enum BrigControlDirective { + BRIG_CONTROL_NONE = 0, + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +typedef uint8_t BrigExecutableModifier8_t; +enum BrigExecutableModifierMask { + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +typedef uint8_t BrigImageChannelOrder8_t; +enum BrigImageChannelOrder { + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + 
BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageChannelType8_t; +enum BrigImageChannelType { + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageGeometry8_t; +enum BrigImageGeometry { + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigImageQuery8_t; +enum BrigImageQuery { + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + + BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6 +}; + +typedef uint8_t BrigLinkage8_t; +enum BrigLinkage { + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + 
BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +typedef uint8_t BrigMachineModel8_t; +enum BrigMachineModel { + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, +}; + +typedef uint8_t BrigMemoryModifier8_t; +enum BrigMemoryModifierMask { + BRIG_MEMORY_CONST = 1 +}; + +typedef uint8_t BrigMemoryOrder8_t; +enum BrigMemoryOrder { + BRIG_MEMORY_ORDER_NONE = 0, + BRIG_MEMORY_ORDER_RELAXED = 1, + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, + BRIG_MEMORY_ORDER_SC_RELEASE = 3, + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, +}; + +typedef uint8_t BrigMemoryScope8_t; +enum BrigMemoryScope { + BRIG_MEMORY_SCOPE_NONE = 0, + BRIG_MEMORY_SCOPE_WORKITEM = 1, + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, + BRIG_MEMORY_SCOPE_WORKGROUP = 3, + BRIG_MEMORY_SCOPE_AGENT = 4, + BRIG_MEMORY_SCOPE_SYSTEM = 5, +}; + +typedef uint16_t BrigOpcode16_t; +enum BrigOpcode { + BRIG_OPCODE_NOP = 0, + BRIG_OPCODE_ABS = 1, + BRIG_OPCODE_ADD = 2, + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, + BRIG_OPCODE_COPYSIGN = 6, + BRIG_OPCODE_DIV = 7, + BRIG_OPCODE_FLOOR = 8, + BRIG_OPCODE_FMA = 9, + BRIG_OPCODE_FRACT = 10, + BRIG_OPCODE_MAD = 11, + BRIG_OPCODE_MAX = 12, + BRIG_OPCODE_MIN = 13, + BRIG_OPCODE_MUL = 14, + BRIG_OPCODE_MULHI = 15, + BRIG_OPCODE_NEG = 16, + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, + BRIG_OPCODE_SQRT = 19, + BRIG_OPCODE_SUB = 20, + BRIG_OPCODE_TRUNC = 21, + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, + BRIG_OPCODE_LASTBIT = 39, + BRIG_OPCODE_COMBINE = 40, + BRIG_OPCODE_EXPAND = 41, + BRIG_OPCODE_LDA = 42, + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + 
BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, + BRIG_OPCODE_UNPACK = 48, + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, + BRIG_OPCODE_UNPACKCVT = 62, + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, + BRIG_OPCODE_SADHI = 65, + BRIG_OPCODE_SEGMENTP = 66, + BRIG_OPCODE_FTOS = 67, + BRIG_OPCODE_STOF = 68, + BRIG_OPCODE_CMP = 69, + BRIG_OPCODE_CVT = 70, + BRIG_OPCODE_LD = 71, + BRIG_OPCODE_ST = 72, + BRIG_OPCODE_ATOMIC = 73, + BRIG_OPCODE_ATOMICNORET = 74, + BRIG_OPCODE_SIGNAL = 75, + BRIG_OPCODE_SIGNALNORET = 76, + BRIG_OPCODE_MEMFENCE = 77, + BRIG_OPCODE_RDIMAGE = 78, + BRIG_OPCODE_LDIMAGE = 79, + BRIG_OPCODE_STIMAGE = 80, + BRIG_OPCODE_IMAGEFENCE = 81, + BRIG_OPCODE_QUERYIMAGE = 82, + BRIG_OPCODE_QUERYSAMPLER = 83, + BRIG_OPCODE_CBR = 84, + BRIG_OPCODE_BR = 85, + BRIG_OPCODE_SBR = 86, + BRIG_OPCODE_BARRIER = 87, + BRIG_OPCODE_WAVEBARRIER = 88, + BRIG_OPCODE_ARRIVEFBAR = 89, + BRIG_OPCODE_INITFBAR = 90, + BRIG_OPCODE_JOINFBAR = 91, + BRIG_OPCODE_LEAVEFBAR = 92, + BRIG_OPCODE_RELEASEFBAR = 93, + BRIG_OPCODE_WAITFBAR = 94, + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, + BRIG_OPCODE_ACTIVELANEID = 97, + BRIG_OPCODE_ACTIVELANEMASK = 98, + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, + BRIG_OPCODE_CALL = 100, + BRIG_OPCODE_SCALL = 101, + BRIG_OPCODE_ICALL = 102, + BRIG_OPCODE_RET = 103, + BRIG_OPCODE_ALLOCA = 104, + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + 
BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, + BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, + BRIG_OPCODE_LDQUEUEREADINDEX = 123, + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, + BRIG_OPCODE_STQUEUEREADINDEX = 125, + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, + BRIG_OPCODE_GROUPBASEPTR = 130, + BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, + BRIG_OPCODE_WAVEID = 136, + + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, +}; + +typedef uint8_t BrigPack8_t; +enum BrigPack { + BRIG_PACK_NONE = 0, + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +typedef uint8_t BrigProfile8_t; +enum BrigProfile { + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, +}; + +typedef uint16_t BrigRegisterKind16_t; +enum BrigRegisterKind { + BRIG_REGISTER_KIND_CONTROL = 0, + BRIG_REGISTER_KIND_SINGLE = 1, + BRIG_REGISTER_KIND_DOUBLE = 2, + BRIG_REGISTER_KIND_QUAD = 3 +}; + +typedef uint8_t BrigRound8_t; +enum BrigRound { + BRIG_ROUND_NONE = 0, + BRIG_ROUND_FLOAT_DEFAULT = 1, + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, + BRIG_ROUND_FLOAT_ZERO = 3, + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, + BRIG_ROUND_INTEGER_ZERO = 7, + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, + BRIG_ROUND_INTEGER_ZERO_SAT = 11, + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, + 
BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 +}; + +typedef uint8_t BrigSamplerAddressing8_t; +enum BrigSamplerAddressing { + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerCoordNormalization8_t; +enum BrigSamplerCoordNormalization { + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +typedef uint8_t BrigSamplerFilter8_t; +enum BrigSamplerFilter { + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerQuery8_t; +enum BrigSamplerQuery { + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +typedef uint32_t BrigSectionIndex32_t; +enum BrigSectionIndex { + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, +}; + +typedef uint8_t BrigSegCvtModifier8_t; +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 +}; + +typedef uint8_t BrigSegment8_t; +enum BrigSegment { + BRIG_SEGMENT_NONE = 0, + BRIG_SEGMENT_FLAT = 1, + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128 +}; + +enum { + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + 
BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +typedef uint16_t BrigType16_t; +enum BrigType { + BRIG_TYPE_NONE = 0, + BRIG_TYPE_U8 = 1, + BRIG_TYPE_U16 = 2, + BRIG_TYPE_U32 = 3, + BRIG_TYPE_U64 = 4, + BRIG_TYPE_S8 = 5, + BRIG_TYPE_S16 = 6, + BRIG_TYPE_S32 = 7, + BRIG_TYPE_S64 = 8, + BRIG_TYPE_F16 = 9, + BRIG_TYPE_F32 = 10, + BRIG_TYPE_F64 = 11, + BRIG_TYPE_B1 = 12, + BRIG_TYPE_B8 = 13, + BRIG_TYPE_B16 = 14, + BRIG_TYPE_B32 = 15, + BRIG_TYPE_B64 = 16, + BRIG_TYPE_B128 = 17, + BRIG_TYPE_SAMP = 18, + BRIG_TYPE_ROIMG = 19, + BRIG_TYPE_WOIMG = 20, + BRIG_TYPE_RWIMG = 21, + BRIG_TYPE_SIG32 = 22, + BRIG_TYPE_SIG64 = 23, + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | 
BRIG_TYPE_PACK_64, + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X16_ARRAY = 
BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, +}; + +typedef uint8_t BrigVariableModifier8_t; +enum BrigVariableModifierMask { + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +typedef uint8_t BrigWidth8_t; +enum BrigWidth { + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + 
BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, +}; + +struct BrigUInt64 { + uint32_t lo; + uint32_t hi; +}; + +struct BrigBase { + uint16_t byteCount; + BrigKind16_t kind; +}; + +struct BrigData { + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigDirectiveArgBlock { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; + uint16_t inArgCount; + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; +}; + +struct BrigDirectiveNone { + BrigBase base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + BrigSegment8_t segment; + 
BrigAlignment8_t align; + BrigUInt64 dim; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; + BrigVersion32_t hsailMinor; + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; +}; + +struct BrigInstBase { + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier8_t modifier; + uint8_t reserved[3]; +}; + +struct BrigInstMemFence { + BrigInstBase base; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t 
groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier8_t modifier; + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t query; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t query; + uint8_t reserved[3]; +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier8_t modifier; +}; + +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; + BrigOperandOffset32_t reg; + BrigUInt64 offset; +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; + 
BrigUInt64 width; + BrigUInt64 height; + BrigUInt64 depth; + BrigUInt64 array; +}; + +struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandRegister { + BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +typedef uint32_t BrigExceptions32_t; +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +struct BrigModuleHeader { + char identification[8]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#ifdef __cplusplus +} +#endif /*__cplusplus*/ + +#endif // defined(INCLUDED_BRIG_H) diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h new file mode 100644 index 0000000000..7c4ed3eea4 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_common.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. 
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The following set of header files provides definitions for AMD GPU
+// Architecture:
+// - amd_hsa_common.h
+// - amd_hsa_elf.h
+// - amd_hsa_kernel_code.h
+// - amd_hsa_queue.h
+// - amd_hsa_signal.h
+//
+// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more
+// information.
+
+#ifndef AMD_HSA_COMMON_H
+#define AMD_HSA_COMMON_H
+
+#include <stddef.h> // NOTE(review): header name was missing in this patch; confirm against upstream
+#include <stdint.h> // fixed-width integer types (uint8_t ... uint64_t) used by the AMD HSA headers
+
+// Descriptive version of the HSA Application Binary Interface.
+#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)"
+
+// Alignment attribute that specifies a minimum alignment (in bytes) for
+// variables of the specified type. Expands to nothing under RC_INVOKED.
+#if defined(__GNUC__)
+# define __ALIGNED__(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+# define __ALIGNED__(x) __declspec(align(x))
+#elif defined(RC_INVOKED)
+# define __ALIGNED__(x)
+#else
+# error "__ALIGNED__ is not defined for this compiler"
+#endif
+
+// Creates enumeration entries for packed types: name##_SHIFT (bit shift amount),
+// name##_WIDTH (bit width), and name (bit mask, computed with int arithmetic).
+#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \
+  name##_SHIFT = (shift), \
+  name##_WIDTH = (width), \
+  name = (((1 << (width)) - 1) << (shift))
+
+// Gets bits for specified mask from specified src packed instance (mask##_SHIFT must exist).
+#define AMD_HSA_BITS_GET(src, mask) \
+  (((src) & (mask)) >> mask ## _SHIFT)
+
+// Sets val bits for specified mask in specified dst packed instance.
+#define AMD_HSA_BITS_SET(dst, mask, val) /* one expression: safe as an unbraced if/else body; dst is evaluated twice */ \
+  ((dst) &= (~(1 << mask##_SHIFT) & ~(mask)), \
+   (dst) |= (((val) << mask##_SHIFT) & (mask)))
+
+#endif // AMD_HSA_COMMON_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h
new file mode 100644
index 0000000000..2b6c4c9672
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_elf.h
@@ -0,0 +1,467 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Undefine the macro in case it is defined in the system elf.h.
+#undef EM_AMDGPU
+
+#ifndef AMD_HSA_ELF_H
+#define AMD_HSA_ELF_H
+
+// AMD GPU Specific ELF Header Enumeration Values.
+//
+// Values are copied from LLVM BinaryFormat/ELF.h. This file also contains
+// code object V1 definitions which are not part of the LLVM header. Code object
+// V1 was only supported by the Finalizer which is now deprecated and removed.
+//
+// TODO: Deprecate and remove V1 support and replace this header with using the
+// LLVM header.
+namespace ELF {
+
+// Machine architectures
+// See current registered ELF machine architectures at:
+// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html
+enum {
+  EM_AMDGPU = 224, // AMD GPU architecture
+};
+
+// OS ABI identification.
+enum {
+  ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime
+};
+
+// AMDGPU OS ABI Version identification.
+enum {
+  // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification
+  // was never defined for V1.
+  ELFABIVERSION_AMDGPU_HSA_V2 = 0,
+  ELFABIVERSION_AMDGPU_HSA_V3 = 1,
+  ELFABIVERSION_AMDGPU_HSA_V4 = 2,
+  ELFABIVERSION_AMDGPU_HSA_V5 = 3,
+  ELFABIVERSION_AMDGPU_HSA_V6 = 4,
+};
+
+// AMDGPU specific e_flags.
+enum : unsigned {
+  // Processor selection mask for EF_AMDGPU_MACH_* values (low 8 bits).
+  EF_AMDGPU_MACH = 0x0ff,
+
+  // Not specified processor.
+  EF_AMDGPU_MACH_NONE = 0x000,
+
+  // AMDGCN-based processors.
+  // clang-format off
+  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
+  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
+  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
+  EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
+  EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
+  EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
+  EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
+  EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
+  EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
+  EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
+  EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
+  EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
+  EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
+  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
+  EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
+  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044,
+  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
+  EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
+  EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
+  EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049,
+  EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a,
+  EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b,
+  EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
+  EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
+  EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
+  EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
+  EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
+  EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
+  EF_AMDGPU_MACH_AMDGCN_GFX1153 = 0x058,
+  EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC = 0x059,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC = 0x05f,
+  // clang-format on
+
+  // First/last AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+  EF_AMDGPU_FEATURE_XNACK_V2 = 0x01,
+  // Indicates if the trap handler is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+  EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02,
+
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_XNACK_V3 = 0x100,
+  // Indicates if the "sramecc" target feature is enabled for all code
+  // contained in the object.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+  EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
+
+  // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
+  // XNACK is not supported.
+  EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
+  // XNACK is any/default/unspecified.
+  EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
+  // XNACK is off.
+  EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
+  // XNACK is on.
+  EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
+
+  // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
+  //
+  // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+  EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
+  // SRAMECC is not supported.
+  EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
+  // SRAMECC is any/default/unspecified.
+  EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
+  // SRAMECC is off.
+  EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
+  // SRAMECC is on.
+  EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
+
+  // Generic target versioning. This is contained in the top byte (bits 31:24) of EFLAGS.
+  EF_AMDGPU_GENERIC_VERSION = 0xff000000,
+  EF_AMDGPU_GENERIC_VERSION_OFFSET = 24,
+  EF_AMDGPU_GENERIC_VERSION_MIN = 1,
+  EF_AMDGPU_GENERIC_VERSION_MAX = 0xff,
+};
+
+// ELF Relocation types for AMDGPU.
+enum : unsigned {
+  R_AMDGPU_ABS32_LO = 1,
+  R_AMDGPU_ABS32_HI = 2,
+  R_AMDGPU_ABS64 = 3,
+  R_AMDGPU_ABS32 = 6,
+  R_AMDGPU_RELATIVE64 = 13,
+};
+
+} // end namespace ELF
+
+// ELF Section Header Flag Enumeration Values (each is masked with SHF_MASKOS).
+#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS)
+
+// AMD GPU HSA segment indices; added to PT_LOOS below to form the program header types.
+typedef enum {
+  AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0,
+  AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1,
+  AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2,
+  AMDGPU_HSA_SEGMENT_CODE_AGENT = 3,
+  AMDGPU_HSA_SEGMENT_LAST,
+} amdgpu_hsa_elf_segment_t;
+
+// ELF Program Header Type Enumeration Values.
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM)
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT)
+#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT)
+#define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT)
+
+// ELF Symbol Type Enumeration Values.
+#define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0)
+#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1)
+#define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2)
+
+// ELF Symbol Binding Enumeration Values.
+#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0)
+
+// ELF Symbol Other Information Creation/Retrieval.
+#define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3) // extracts bits 3:2 of o
+#define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4) // extracts bits 4 and above of o
+#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3)) // packs f at bit 4, a in bits 3:2, v in bits 1:0
+
+typedef enum {
+  AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3,
+  AMDGPU_HSA_SYMBOL_ALLOCATION_LAST,
+} amdgpu_hsa_symbol_allocation_t;
+
+// ELF Symbol Allocation Enumeration Values (aliases of amdgpu_hsa_symbol_allocation_t entries).
+#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT
+#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM
+#define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT
+#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT
+
+typedef enum {
+  AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0,
+  AMDGPU_HSA_SYMBOL_FLAG_CONST = 1,
+  AMDGPU_HSA_SYMBOL_FLAG_LAST,
+} amdgpu_hsa_symbol_flag_t;
+
+// ELF Symbol Flag Enumeration Values (alias of the amdgpu_hsa_symbol_flag_t entry).
+#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST
+
+// Legacy/V1 AMD GPU Relocation Type Enumeration Values.
+#define R_AMDGPU_V1_NONE 0
+#define R_AMDGPU_V1_32_LOW 1
+#define R_AMDGPU_V1_32_HIGH 2
+#define R_AMDGPU_V1_64 3
+#define R_AMDGPU_V1_INIT_SAMPLER 4
+#define R_AMDGPU_V1_INIT_IMAGE 5
+#define R_AMDGPU_V1_RELATIVE64 13
+
+// AMD GPU Note Type Enumeration Values.
+#define NT_AMD_HSA_CODE_OBJECT_VERSION 1
+#define NT_AMD_HSA_HSAIL 2
+#define NT_AMD_HSA_ISA_VERSION 3
+#define NT_AMD_HSA_PRODUCER 4
+#define NT_AMD_HSA_PRODUCER_OPTIONS 5
+#define NT_AMD_HSA_EXTENSION 6
+#define NT_AMD_HSA_ISA_NAME 11
+/* AMDGPU snapshots of runtime, agent and queue state for use in core dumps */
+#define NT_AMDGPU_CORE_STATE 33
+#define NT_AMD_HSA_HLDEBUG_DEBUG 101
+#define NT_AMD_HSA_HLDEBUG_TARGET 102
+
+// AMD GPU Metadata Kind Enumeration Values.
+typedef uint16_t amdgpu_hsa_metadata_kind16_t; // 16-bit type of the descriptors' `kind` field
+typedef enum {
+  AMDGPU_HSA_METADATA_KIND_NONE = 0,
+  AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1,
+  AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2,
+  AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3,
+  AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4
+} amdgpu_hsa_metadata_kind_t;
+
+// AMD GPU Sampler Coordinate Normalization Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_coord8_t; // 8-bit type of the sampler descriptor's `coord` field
+typedef enum {
+  AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0,
+  AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1
+} amdgpu_hsa_sampler_coord_t;
+
+// AMD GPU Sampler Filter Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_filter8_t; // 8-bit type of the sampler descriptor's `filter` field
+typedef enum {
+  AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0,
+  AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1
+} amdgpu_hsa_sampler_filter_t;
+
+// AMD GPU Sampler Addressing Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_addressing8_t; // 8-bit type of the sampler descriptor's `addressing` field
+typedef enum {
+  AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3,
+  AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4
+} amdgpu_hsa_sampler_addressing_t;
+
+// AMD GPU Sampler Descriptor.
+typedef struct amdgpu_hsa_sampler_descriptor_s {
+  uint16_t size; // NOTE(review): presumably the descriptor size in bytes; confirm
+  amdgpu_hsa_metadata_kind16_t kind; // NOTE(review): presumably an AMDGPU_HSA_METADATA_KIND_* value; confirm
+  amdgpu_hsa_sampler_coord8_t coord;
+  amdgpu_hsa_sampler_filter8_t filter;
+  amdgpu_hsa_sampler_addressing8_t addressing;
+  uint8_t reserved1;
+} amdgpu_hsa_sampler_descriptor_t;
+
+// AMD GPU Image Geometry Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_geometry8_t; // 8-bit typedef declared next to amdgpu_hsa_image_geometry_t
+typedef enum {
+  AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1,
+  AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2,
+  AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4,
+  AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6,
+  AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7
+} amdgpu_hsa_image_geometry_t;
+
+// AMD GPU Image Channel Order Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_channel_order8_t; // 8-bit typedef declared next to amdgpu_hsa_image_channel_order_t
+typedef enum {
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+  AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} amdgpu_hsa_image_channel_order_t;
+
+// AMD GPU Image Channel Type Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_channel_type8_t; // 8-bit type of the image descriptor's `channel_type` field
+typedef enum {
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+  AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} amdgpu_hsa_image_channel_type_t;
+
+// AMD GPU Image Descriptor.
+typedef struct amdgpu_hsa_image_descriptor_s {
+  uint16_t size; // NOTE(review): presumably the descriptor size in bytes; confirm
+  amdgpu_hsa_metadata_kind16_t kind;
+  amdgpu_hsa_image_geometry8_t geometry;
+  amdgpu_hsa_image_channel_order8_t channel_order;
+  amdgpu_hsa_image_channel_type8_t channel_type;
+  uint8_t reserved1;
+  uint64_t width;
+  uint64_t height;
+  uint64_t depth;
+  uint64_t array;
+} amdgpu_hsa_image_descriptor_t;
+
+typedef struct amdgpu_hsa_note_code_object_version_s {
+  uint32_t major_version;
+  uint32_t minor_version;
+} amdgpu_hsa_note_code_object_version_t;
+
+typedef struct amdgpu_hsa_note_hsail_s {
+  uint32_t hsail_major_version;
+  uint32_t hsail_minor_version;
+  uint8_t profile;
+  uint8_t machine_model;
+  uint8_t default_float_round;
+} amdgpu_hsa_note_hsail_t;
+
+typedef struct amdgpu_hsa_note_isa_s {
+  uint16_t vendor_name_size;
+  uint16_t architecture_name_size;
+  uint32_t major;
+  uint32_t minor;
+  uint32_t stepping;
+  char vendor_and_architecture_name[1]; // NOTE(review): presumably variable-length, sized by the two *_name_size fields; confirm
+} amdgpu_hsa_note_isa_t;
+
+typedef struct amdgpu_hsa_note_producer_s {
+  uint16_t producer_name_size;
+  uint16_t reserved;
+  uint32_t producer_major_version;
+  uint32_t producer_minor_version;
+  char producer_name[1]; // NOTE(review): presumably variable-length, sized by producer_name_size; confirm
+} amdgpu_hsa_note_producer_t;
+
+typedef struct amdgpu_hsa_note_producer_options_s {
+  uint16_t producer_options_size;
+  char producer_options[1]; // NOTE(review): presumably variable-length, sized by producer_options_size; confirm
+} amdgpu_hsa_note_producer_options_t;
+
+typedef enum {
+  AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, // the following entries take the implicit values 1..8
+  AMDGPU_HSA_RODATA_GLOBAL_AGENT,
+  AMDGPU_HSA_RODATA_READONLY_AGENT,
+  AMDGPU_HSA_DATA_GLOBAL_PROGRAM,
+  AMDGPU_HSA_DATA_GLOBAL_AGENT,
+  AMDGPU_HSA_DATA_READONLY_AGENT,
+  AMDGPU_HSA_BSS_GLOBAL_PROGRAM,
+  AMDGPU_HSA_BSS_GLOBAL_AGENT,
+  AMDGPU_HSA_BSS_READONLY_AGENT,
+  AMDGPU_HSA_SECTION_LAST, // evaluates to 9, the number of entries above
+} amdgpu_hsa_elf_section_t;
+
+#endif // AMD_HSA_ELF_H
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h
new file mode 100644
index 0000000000..c00c88c024
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_kernel_code.h
@@ -0,0 +1,270 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_KERNEL_CODE_H +#define AMD_HSA_KERNEL_CODE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Kernel Code Version Enumeration Values. +typedef uint32_t amd_kernel_code_version32_t; +enum amd_kernel_code_version_t { + AMD_KERNEL_CODE_VERSION_MAJOR = 1, + AMD_KERNEL_CODE_VERSION_MINOR = 1 +}; + +// AMD Machine Kind Enumeration Values. +typedef uint16_t amd_machine_kind16_t; +enum amd_machine_kind_t { + AMD_MACHINE_KIND_UNDEFINED = 0, + AMD_MACHINE_KIND_AMDGPU = 1 +}; + +// AMD Machine Version. +typedef uint16_t amd_machine_version16_t; + +// AMD Float Round Mode Enumeration Values. +enum amd_float_round_mode_t { + AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0, + AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1, + AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2, + AMD_FLOAT_ROUND_MODE_ZERO = 3 +}; + +// AMD Float Denorm Mode Enumeration Values. 
+enum amd_float_denorm_mode_t { + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0, + AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1, + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2, + AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3 +}; + +// AMD Compute Program Resource Register One. +typedef uint32_t amd_compute_pgm_rsrc_one32_t; +enum amd_compute_pgm_rsrc_one_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6) +}; + +// AMD System VGPR Workitem ID Enumeration Values. +enum amd_system_vgpr_workitem_id_t { + AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2, + AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3 +}; + +// AMD Compute Program Resource Register Two. 
+typedef uint32_t amd_compute_pgm_rsrc_two32_t; +enum amd_compute_pgm_rsrc_two_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1) +}; + +// AMD Element 
Byte Size Enumeration Values. +enum amd_element_byte_size_t { + AMD_ELEMENT_BYTE_SIZE_2 = 0, + AMD_ELEMENT_BYTE_SIZE_4 = 1, + AMD_ELEMENT_BYTE_SIZE_8 = 2, + AMD_ELEMENT_BYTE_SIZE_16 = 3 +}; + +// AMD Kernel Code Properties. +typedef uint32_t amd_kernel_code_properties32_t; +enum amd_kernel_code_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1), + 
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9) +}; + +// AMD Power Of Two Enumeration Values. +typedef uint8_t amd_powertwo8_t; +enum amd_powertwo_t { + AMD_POWERTWO_1 = 0, + AMD_POWERTWO_2 = 1, + AMD_POWERTWO_4 = 2, + AMD_POWERTWO_8 = 3, + AMD_POWERTWO_16 = 4, + AMD_POWERTWO_32 = 5, + AMD_POWERTWO_64 = 6, + AMD_POWERTWO_128 = 7, + AMD_POWERTWO_256 = 8 +}; + +// AMD Enabled Control Directive Enumeration Values. +typedef uint64_t amd_enabled_control_directive64_t; +enum amd_enabled_control_directive_t { + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1, + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256 +}; + +// AMD Exception Kind Enumeration Values. +typedef uint16_t amd_exception_kind16_t; +enum amd_exception_kind_t { + AMD_EXCEPTION_KIND_INVALID_OPERATION = 1, + AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2, + AMD_EXCEPTION_KIND_OVERFLOW = 4, + AMD_EXCEPTION_KIND_UNDERFLOW = 8, + AMD_EXCEPTION_KIND_INEXACT = 16 +}; + +// AMD Control Directives. 
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64 +#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) +typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s { + amd_enabled_control_directive64_t enabled_control_directives; + uint16_t enable_break_exceptions; + uint16_t enable_detect_exceptions; + uint32_t max_dynamic_group_size; + uint64_t max_flat_grid_size; + uint32_t max_flat_workgroup_size; + uint8_t required_dim; + uint8_t reserved1[3]; + uint64_t required_grid_size[3]; + uint32_t required_workgroup_size[3]; + uint8_t reserved2[60]; +} amd_control_directives_t; + +// AMD Kernel Code. +#define AMD_ISA_ALIGN_BYTES 256 +#define AMD_KERNEL_CODE_ALIGN_BYTES 64 +#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) +typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s { + amd_kernel_code_version32_t amd_kernel_code_version_major; + amd_kernel_code_version32_t amd_kernel_code_version_minor; + amd_machine_kind16_t amd_machine_kind; + amd_machine_version16_t amd_machine_version_major; + amd_machine_version16_t amd_machine_version_minor; + amd_machine_version16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; + amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; + amd_kernel_code_properties32_t kernel_code_properties; + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t 
debug_private_segment_buffer_sgpr; + amd_powertwo8_t kernarg_segment_alignment; + amd_powertwo8_t group_segment_alignment; + amd_powertwo8_t private_segment_alignment; + amd_powertwo8_t wavefront_size; + int32_t call_convention; + uint8_t reserved1[12]; + uint64_t runtime_loader_kernel_symbol; + amd_control_directives_t control_directives; +} amd_kernel_code_t; + +// TODO: this struct should be completely gone once debugger designs/implements +// Debugger APIs. +typedef struct amd_runtime_loader_debug_info_s { + const void* elf_raw; + size_t elf_size; + const char *kernel_name; + const void *owning_segment; +} amd_runtime_loader_debug_info_t; + +#endif // AMD_HSA_KERNEL_CODE_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h new file mode 100644 index 0000000000..9f16f9b2e5 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_queue.h @@ -0,0 +1,154 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_QUEUE_H +#define AMD_HSA_QUEUE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Queue Properties. +typedef uint32_t amd_queue_properties32_t; +enum amd_queue_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27) +}; + +// AMD Queue. +#define AMD_QUEUE_ALIGN_BYTES 64 +#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES) + +// AMD Queue Capabilities. 
+typedef uint32_t amd_queue_capabilities32_t; +enum amd_queue_capabilities_t { + /* This version of CP FW supports dual-scratch and async-reclaim */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_CP_ASYNC_RECLAIM, 0, 1), + + /* + * This version of ROCr supports async-reclaim and CP FW may access the + * V2 fields. + */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM, 1, 1), +}; + +/* This is the original amd_queue_t structure. The definition is only kept + * for reference purposes. This structure should not be used. */ +typedef struct AMD_QUEUE_ALIGN amd_queue_s { + hsa_queue_t hsa_queue; + uint32_t caps; + uint32_t reserved1[3]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint32_t reserved3[2]; + uint32_t scratch_wave64_lane_byte_size; + amd_queue_properties32_t queue_properties; + uint32_t reserved4[2]; + hsa_signal_t queue_inactive_signal; + uint32_t reserved5[14]; +} amd_queue_t; + +/* + * AMD_QUEUE Version 2 + * amd_queue_v2_t is backwards compatible with amd_queue_t structure and can + * be used with previous versions of CP FW. The added fields tagged as V2 are + * ignored when running previous versions of CP FW. + * CP FW will not try to access elements beyond the original structure + * (sizeof(amd_queue_t)) unless the AMD_QUEUE_CAPS_SW_ASYNC_RECLAIM bit is set.
+ */ + +#define MAX_NUM_XCC 128 +typedef struct scratch_last_used_index_xcc_s { + volatile uint64_t main; + volatile uint64_t alt; +} scratch_last_used_index_xcc_t; + +typedef struct AMD_QUEUE_ALIGN amd_queue_v2_s { + hsa_queue_t hsa_queue; + uint32_t caps; + uint32_t reserved1[3]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint64_t scratch_backing_memory_byte_size; + uint32_t scratch_wave64_lane_byte_size; + amd_queue_properties32_t queue_properties; + volatile uint64_t scratch_max_use_index; /* V2 */ + hsa_signal_t queue_inactive_signal; + volatile uint64_t alt_scratch_max_use_index; /* V2 */ + uint32_t alt_scratch_resource_descriptor[4]; /* V2 */ + uint64_t alt_scratch_backing_memory_location; /* V2 */ + uint32_t alt_scratch_dispatch_limit_x; /* V2 */ + uint32_t alt_scratch_dispatch_limit_y; /* V2 */ + uint32_t alt_scratch_dispatch_limit_z; /* V2 */ + uint32_t alt_scratch_wave64_lane_byte_size; /* V2 */ + uint32_t alt_compute_tmpring_size; /* V2 */ + uint32_t reserved5; + + scratch_last_used_index_xcc_t scratch_last_used_index[MAX_NUM_XCC]; +} amd_queue_v2_t; + +#endif // AMD_HSA_QUEUE_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h new file mode 100644 index 0000000000..fa797599a0 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/amd_hsa_signal.h @@ -0,0 +1,79 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA 
+// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_SIGNAL_H +#define AMD_HSA_SIGNAL_H + +#include "amd_hsa_common.h" +#include "amd_hsa_queue.h" + +// AMD Signal Kind Enumeration Values. +typedef int64_t amd_signal_kind64_t; +enum amd_signal_kind_t { + AMD_SIGNAL_KIND_INVALID = 0, + AMD_SIGNAL_KIND_USER = 1, + AMD_SIGNAL_KIND_DOORBELL = -1, + AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 +}; + +// AMD Signal. +#define AMD_SIGNAL_ALIGN_BYTES 64 +#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES) +typedef struct AMD_SIGNAL_ALIGN amd_signal_s { + amd_signal_kind64_t kind; + union { + volatile int64_t value; + volatile uint64_t* hardware_doorbell_ptr; + }; + uint64_t event_mailbox_ptr; + uint32_t event_id; + uint32_t reserved1; + uint64_t start_ts; + uint64_t end_ts; + union { + amd_queue_v2_t* queue_ptr; + uint64_t reserved2; + }; + uint32_t reserved3[2]; +} amd_signal_t; + +#endif // AMD_HSA_SIGNAL_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h new file mode 100644 index 0000000000..00753e992e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa.h @@ -0,0 +1,5752 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_H_ +#define HSA_RUNTIME_INC_HSA_H_ + +#include /* size_t */ +#include /* uintXX_t */ + +#ifndef __cplusplus +#include /* bool */ +#endif /* __cplusplus */ + +// Placeholder for calling convention and import/export macros +#ifndef HSA_CALL +#define HSA_CALL +#endif + +#ifndef HSA_EXPORT_DECORATOR +#ifdef __GNUC__ +#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default"))) +#else +#define HSA_EXPORT_DECORATOR +#endif +#endif +#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL +#define HSA_API_IMPORT HSA_CALL + +#if !defined(HSA_API) && defined(HSA_EXPORT) +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +// Detect and set large model builds. +#undef HSA_LARGE_MODEL +#if defined(__LP64__) || defined(_M_X64) +#define HSA_LARGE_MODEL +#endif + +// Try to detect CPU endianness +#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU) +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LITTLEENDIAN_CPU +#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define BIGENDIAN_CPU +#elif defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) || defined(__loongarch64) || defined(__riscv) +#define LITTLEENDIAN_CPU +#endif +#endif + +#undef HSA_LITTLE_ENDIAN +#if defined(LITTLEENDIAN_CPU) +#define HSA_LITTLE_ENDIAN +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +#ifndef HSA_DEPRECATED +#define HSA_DEPRECATED +//#ifdef __GNUC__ +//#define HSA_DEPRECATED __attribute__((deprecated)) +//#else +//#define HSA_DEPRECATED __declspec(deprecated) +//#endif +#endif + +#define HSA_VERSION_1_0 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \addtogroup error-codes Error codes + * @{ + */ + +/** + * @brief Status codes. 
+ */ +typedef enum { + /** + * The function has been executed successfully. + */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred. + */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. + */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events. + */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a function are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid.
+ */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception. + */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016, + /** + * The code object symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, + /** + * The executable symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, + /** + * The file descriptor is invalid. + */ + HSA_STATUS_ERROR_INVALID_FILE = 0x1020, + /** + * The code object reader is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, + /** + * The cache is invalid. + */ + HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, + /** + * The wavefront is invalid. + */ + HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, + /** + * The signal group is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, + /** + * The HSA runtime is not in the configuration state. + */ + HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025, + /** + * The queue received an error that may require process termination. + */ + HSA_STATUS_ERROR_FATAL = 0x1026 +} hsa_status_t; + +/** + * @brief Query additional information about a status code. + * + * @param[in] status Status code. 
+ * + * @param[out] status_string A NUL-terminated string that describes the error + * status. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid + * status code, or @p status_string is NULL. + */ +hsa_status_t HSA_API hsa_status_string( + hsa_status_t status, + const char ** status_string); + +/** @} */ + +/** \defgroup common Common Definitions + * @{ + */ + +/** + * @brief Three-dimensional coordinate. + */ +typedef struct hsa_dim3_s { + /** + * X dimension. + */ + uint32_t x; + + /** + * Y dimension. + */ + uint32_t y; + + /** + * Z dimension. + */ + uint32_t z; +} hsa_dim3_t; + +/** + * @brief Access permissions. + */ +typedef enum { + /** + * Used to remove existing access + */ + HSA_ACCESS_PERMISSION_NONE = 0, + /** + * Read-only access. + */ + HSA_ACCESS_PERMISSION_RO = 1, + /** + * Write-only access. + */ + HSA_ACCESS_PERMISSION_WO = 2, + /** + * Read and write access. + */ + HSA_ACCESS_PERMISSION_RW = 3 +} hsa_access_permission_t; + +/** + * @brief POSIX file descriptor. + */ +typedef int hsa_file_t; + +/** @} **/ + + +/** \defgroup initshutdown Initialization and Shut Down + * @{ + */ + +/** + * @brief Initialize the HSA runtime. + * + * @details Initializes the HSA runtime if it is not already initialized, and + * increases the reference counter associated with the HSA runtime for the + * current process. Invocation of any HSA function other than ::hsa_init results + * in undefined behavior if the current HSA runtime reference counter is less + * than one. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. 
+ * + * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference + * count reaches INT32_MAX. + */ +hsa_status_t HSA_API hsa_init(); + +/** + * @brief Shut down the HSA runtime. + * + * @details Decreases the reference count of the HSA runtime instance. When the + * reference count reaches 0, the HSA runtime is no longer considered valid + * but the application might call ::hsa_init to initialize the HSA runtime + * again. + * + * Once the reference count of the HSA runtime reaches 0, all the resources + * associated with it (queues, signals, agent information, etc.) are + * considered invalid and any attempt to reference them in subsequent API calls + * results in undefined behavior. When the reference count reaches 0, the HSA + * runtime may release resources associated with it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_shut_down(); + +/** @} **/ + +/** \defgroup agentinfo System and Agent Information + * @{ + */ + +/** + * @brief Endianness. A convention used to interpret the bytes making up a data + * word. + */ +typedef enum { + /** + * The least significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_LITTLE = 0, + /** + * The most significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_BIG = 1 +} hsa_endianness_t; + +/** + * @brief Machine model. A machine model determines the size of certain data + * types in HSA runtime and an agent. + */ +typedef enum { + /** + * Small machine model. Addresses use 32 bits. + */ + HSA_MACHINE_MODEL_SMALL = 0, + /** + * Large machine model. Addresses use 64 bits. + */ + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +/** + * @brief Profile. A profile indicates a particular level of feature + * support. 
For example, in the base profile the application must use the HSA + * runtime allocator to reserve shared virtual memory, while in the full profile + * any host pointer can be shared across all the agents. + */ +typedef enum { + /** + * Base profile. + */ + HSA_PROFILE_BASE = 0, + /** + * Full profile. + */ + HSA_PROFILE_FULL = 1 +} hsa_profile_t; + +/** + * @brief System attributes. + */ +typedef enum { + /** + * Major version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MAJOR = 0, + /** + * Minor version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MINOR = 1, + /** + * Current timestamp. The value of this attribute monotonically increases at a + * constant rate. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP = 2, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3, + /** + * Maximum duration of a signal wait operation. Expressed as a count based on + * the timestamp frequency. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4, + /** + * Endianness of the system. The type of this attribute is ::hsa_endianness_t. + */ + HSA_SYSTEM_INFO_ENDIANNESS = 5, + /** + * Machine model supported by the HSA runtime. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_SYSTEM_INFO_MACHINE_MODEL = 6, + /** + * Bit-mask indicating which extensions are supported by the + * implementation. An extension with an ID of @p i is supported if the bit at + * position @p i is set. The type of this attribute is uint8_t[128]. + */ + HSA_SYSTEM_INFO_EXTENSIONS = 7, + /** + * String containing the ROCr build identifier. 
+ */ + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202, + /** + * Returns true if mwaitx is enabled on this system + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203, + /** + * Returns true if DMABUF APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED = 0x204, + /** + * Returns true if Virtual Memory APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED = 0x205, + /** + * Returns true if XNACK is enabled on this system. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_XNACK_ENABLED = 0x206, + /** + * Major version of the HSA runtime extension specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_AMD_SYSTEM_INFO_EXT_VERSION_MAJOR = 0x207, + /** + * Minor version of the HSA runtime extension specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_AMD_SYSTEM_INFO_EXT_VERSION_MINOR = 0x208, +} hsa_system_info_t; + +/** + * @brief Get the current value of a system attribute. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. 
If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * system attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_system_get_info( + hsa_system_info_t attribute, + void* value); + +/** + * @brief HSA extensions. + */ +typedef enum { + /** + * Finalizer extension. + */ + HSA_EXTENSION_FINALIZER = 0, + /** + * Images extension. + */ + HSA_EXTENSION_IMAGES = 1, + + /** + * Performance counter extension. + */ + HSA_EXTENSION_PERFORMANCE_COUNTERS = 2, + + /** + * Profiling events extension. + */ + HSA_EXTENSION_PROFILING_EVENTS = 3, + /** + * Last standard extension. + */ + HSA_EXTENSION_STD_LAST = 3, + /** + * First AMD extension number. + */ + HSA_AMD_FIRST_EXTENSION = 0x200, + /** + * Profiler extension. + */ + HSA_EXTENSION_AMD_PROFILER = 0x200, + /** + * Loader extension. + */ + HSA_EXTENSION_AMD_LOADER = 0x201, + /** + * AqlProfile extension. + */ + HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * PC Sampling extension. + */ + HSA_EXTENSION_AMD_PC_SAMPLING = 0x203, + /** + * Last AMD extension. + */ + HSA_AMD_LAST_EXTENSION = 0x203 +} hsa_extension_t; + +/** + * @brief Query the name of a given extension. + * + * @param[in] extension Extension identifier. If the extension is not supported + * by the implementation (see ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior + * is undefined. + * + * @param[out] name Pointer to a memory location where the HSA runtime stores + * the extension name. The extension name is a NUL-terminated string. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p name is NULL. + */ +hsa_status_t HSA_API hsa_extension_get_name( + uint16_t extension, + const char **name); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by the HSA + * implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by the HSA + * implementation. All minor versions from 0 up to the returned @p version_minor + * must be supported by the implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_system_major_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** + * @deprecated + * + * @brief Retrieve the function pointers corresponding to a given version of an + * extension. Portable applications are expected to invoke the extension API + * using the returned function pointers + * + * @details The application is responsible for verifying that the given version + * of the extension is supported by the HSA implementation (see + * ::hsa_system_extension_supported). If the given combination of extension, + * major version, and minor version is not supported by the implementation, the + * behavior is undefined. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] version_minor Minor version number for which to retrieve the + * function pointer table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + void *table); + +/** + * @brief Retrieve the function pointers corresponding to a given major version + * of an extension. 
Portable applications are expected to invoke the extension + * API using the returned function pointers. + * + * @details The application is responsible for verifying that the given major + * version of the extension is supported by the HSA implementation (see + * ::hsa_system_major_extension_supported). If the given combination of extension + * and major version is not supported by the implementation, the behavior is + * undefined. Additionally if the length doesn't allow space for a full minor + * version, it is implementation defined if only some of the function pointers for + * that minor version get written. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] table_length Size in bytes of the function pointer table to be + * populated. The implementation will not write more than this many bytes to the + * table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API hsa_system_get_major_extension_table( + uint16_t extension, + uint16_t version_major, + size_t table_length, + void *table); + +/** + * @brief Struct containing an opaque handle to an agent, a device that participates in + * the HSA memory model. An agent can submit AQL packets for execution, and + * may also accept AQL packets for execution (agent dispatch packets or kernel + * dispatch packets launching HSAIL-derived binaries). 
+ */ +typedef struct hsa_agent_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_agent_t; + +/** + * @brief Agent features. + */ +typedef enum { + /** + * The agent supports AQL packets of kernel dispatch type. If this + * feature is enabled, the agent is also a kernel agent. + */ + HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, + /** + * The agent supports AQL packets of agent dispatch type. + */ + HSA_AGENT_FEATURE_AGENT_DISPATCH = 2 +} hsa_agent_feature_t; + +/** + * @brief Hardware device type. + */ +typedef enum { + /** + * CPU device. + */ + HSA_DEVICE_TYPE_CPU = 0, + /** + * GPU device. + */ + HSA_DEVICE_TYPE_GPU = 1, + /** + * DSP device. + */ + HSA_DEVICE_TYPE_DSP = 2, + /** + * AI Engine (AIE) device. + */ + HSA_DEVICE_TYPE_AIE = 3 +} hsa_device_type_t; + +/** + * @brief Default floating-point rounding mode. + */ +typedef enum { + /** + * Use a default floating-point rounding mode specified elsewhere. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, + /** + * Operations that specify the default floating-point mode are rounded to zero + * by default. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, + /** + * Operations that specify the default floating-point mode are rounded to the + * nearest representable number and that ties should be broken by selecting + * the value with an even least significant bit. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2 +} hsa_default_float_rounding_mode_t; + +/** + * @brief Agent attributes. + */ +typedef enum { + /** + * Agent name. The type of this attribute is a NUL-terminated char[64]. The + * name must be at most 63 characters long (not including the NUL terminator) + * and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_NAME = 0, + /** + * Name of vendor. The type of this attribute is a NUL-terminated char[64]. 
+ * The name must be at most 63 characters long (not including the NUL + * terminator) and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_VENDOR_NAME = 1, + /** + * Agent capability. The type of this attribute is ::hsa_agent_feature_t. + */ + HSA_AGENT_INFO_FEATURE = 2, + /** + * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Machine model supported by the agent. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_AGENT_INFO_MACHINE_MODEL = 3, + /** + * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Profile supported by the agent. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_AGENT_INFO_PROFILE = 4, + /** + * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given + * instruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas. + * + * Default floating-point rounding mode. The type of this attribute is + * ::hsa_default_float_rounding_mode_t, but the value + * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed. + */ + HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5, + /** + * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES + * for a given instruction set architecture supported by the agent instead. If + * more than one ISA is supported by the agent, the returned value corresponds + * to the first ISA enumerated by ::hsa_agent_iterate_isas.
+ * + * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the + * default floating-point rounding modes supported by the agent in the Base + * profile. The type of this attribute is uint32_t. The default floating-point + * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not + * be set. + */ + HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23, + /** + * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the current agent. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is bool. + */ + HSA_AGENT_INFO_FAST_F16_OPERATION = 24, + /** + * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and + * instruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront + * enumerated by ::hsa_isa_iterate_wavefronts for that ISA. + * + * Number of work-items in a wavefront. Must be a power of 2 in the range + * [1,256]. The value of this attribute is undefined if the agent is not + * a kernel agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0.
No maximum can exceed the value of + * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is uint16_t[3]. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a work-group. The value of this + * attribute is undefined if the agent is not a kernel agent. The type + * of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given instruction set + * architecture supported by the agent instead. + * + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined + * if the agent is not a kernel agent. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_AGENT_INFO_GRID_MAX_DIM = 9, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a grid. The value of this attribute + * is undefined if the agent is not a kernel agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_GRID_MAX_SIZE = 10, + /** + * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given instruction + * set architecture supported by the agent instead.
If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of fbarriers per work-group. Must be at least 32. The value + * of this attribute is undefined if the agent is not a kernel agent. The + * type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, + /** + * @deprecated The maximum number of queues is not statically determined. + * + * Maximum number of queues that can be active (created but not destroyed) at + * one time in the agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUES_MAX = 12, + /** + * Minimum number of packets that a queue created in the agent + * can hold. Must be a power of 2 greater than 0. Must not exceed + * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, + /** + * Maximum number of packets that a queue created in the agent can + * hold. Must be a power of 2 greater than 0. The type of this attribute + * is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, + /** + * Type of a queue created in the agent. The type of this attribute is + * ::hsa_queue_type32_t. + */ + HSA_AGENT_INFO_QUEUE_TYPE = 15, + /** + * @deprecated NUMA information is not exposed anywhere else in the API. + * + * Identifier of the NUMA node associated with the agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_NODE = 16, + /** + * Type of hardware device associated with the agent. The type of this + * attribute is ::hsa_device_type_t. + */ + HSA_AGENT_INFO_DEVICE = 17, + /** + * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about + * the caches present in a given agent. + * + * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size + * of 0 for a particular level indicates that there is no cache information + * for that level. 
The type of this attribute is uint32_t[4]. + */ + HSA_AGENT_INFO_CACHE_SIZE = 18, + /** + * @deprecated An agent may support multiple instruction set + * architectures. See ::hsa_agent_iterate_isas. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Instruction set architecture of the agent. The type of this attribute + * is ::hsa_isa_t. + */ + HSA_AGENT_INFO_ISA = 19, + /** + * Bit-mask indicating which extensions are supported by the agent. An + * extension with an ID of @p i is supported if the bit at position @p i is + * set. The type of this attribute is uint8_t[128]. + */ + HSA_AGENT_INFO_EXTENSIONS = 20, + /** + * Major version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MAJOR = 21, + /** + * Minor version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MINOR = 22, + /** + * This enum does not have a fixed underlying type, thus in C++ post D2338: + * If the enumeration type does not have a fixed underlying type, the value is + * unchanged if the original value is within the range of the enumeration + * values (9.7.1 [dcl.enum]), and otherwise, the behavior is + * undefined. + * Thus increase the range of this enum to encompass vendor extensions. + */ + HSA_AGENT_INFO_LAST = INT32_MAX +} hsa_agent_info_t; + +/** + * @brief Get the current value of an attribute for a given agent. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * agent attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_agent_get_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** + * @brief Iterate over the available agents, and invoke an + * application-defined callback on every iteration. + * + * @param[in] callback Callback to be invoked once per agent. The HSA + * runtime passes two arguments to the callback: the agent and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_iterate_agents returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t HSA_API hsa_iterate_agents( + hsa_status_t (*callback)(hsa_agent_t agent, void* data), + void* data); + +/* + +// If we do not know the size of an attribute, we need to query it first +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_get_info_size( + hsa_agent_t agent, + hsa_agent_info_t attribute, + size_t* size); + +// Set the value of an agents attribute +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_set_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +*/ + +/** + * @brief Exception policies applied in the presence of hardware exceptions. + */ +typedef enum { + /** + * If a hardware exception is detected, a work-item signals an exception. 
+ */ + HSA_EXCEPTION_POLICY_BREAK = 1, + /** + * If a hardware exception is detected, a hardware status bit is set. + */ + HSA_EXCEPTION_POLICY_DETECT = 2 +} hsa_exception_policy_t; + +/** + * @deprecated Use ::hsa_isa_get_exception_policies for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, this function uses the first value returned by + * ::hsa_agent_iterate_isas. + * + * @brief Retrieve the exception policy support for a given combination of + * agent and profile. + * + * @param[in] agent Agent. + * + * @param[in] profile Profile. + * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + * + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies( + hsa_agent_t agent, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Cache handle. + */ +typedef struct hsa_cache_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_cache_t; + +/** + * @brief Cache attributes. + */ +typedef enum { + /** + * The length of the cache name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is a NUL-terminated + * character array with the length equal to the value of + * ::HSA_CACHE_INFO_NAME_LENGTH attribute. + */ + HSA_CACHE_INFO_NAME = 1, + /** + * Cache level.
An L1 cache must return a value of 1, an L2 must return a value + * of 2, and so on. The type of this attribute is uint8_t. + */ + HSA_CACHE_INFO_LEVEL = 2, + /** + * Cache size, in bytes. A value of 0 indicates that there is no size + * information available. The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_SIZE = 3 +} hsa_cache_info_t; + +/** + * @brief Get the current value of an attribute for a given cache object. + * + * @param[in] cache Cache. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * cache attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_cache_get_info( + hsa_cache_t cache, + hsa_cache_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory caches of a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details Caches are visited in ascending order according to the value of the + * ::HSA_CACHE_INFO_LEVEL attribute. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per cache that is present in + * the agent. The HSA runtime passes two arguments to the callback: the cache + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that value is returned.
+ * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_caches( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_cache_t cache, void* data), + void* data); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by an agent + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by an agent. All + * minor versions from 0 up to the returned @p version_minor must be supported. + * + * @param[in] extension Extension identifier. 
+ * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_agent_major_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** @} */ + + +/** \defgroup signals Signals + * @{ + */ + +/** + * @brief Signal handle. + */ +typedef struct hsa_signal_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. The value 0 is reserved. + */ + uint64_t handle; +} hsa_signal_t; + +/** + * @brief Signal value. The value occupies 32 bits in small machine mode, and 64 + * bits in large machine mode. + */ +#ifdef HSA_LARGE_MODEL + typedef int64_t hsa_signal_value_t; +#else + typedef int32_t hsa_signal_value_t; +#endif + +/** + * @brief Create a signal. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. 
If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. + */ +hsa_status_t HSA_API hsa_signal_create( + hsa_signal_value_t initial_value, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_t *signal); + +/** + * @brief Destroy a signal previously created by ::hsa_signal_create. + * + * @param[in] signal Signal. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0. + */ +hsa_status_t HSA_API hsa_signal_destroy( + hsa_signal_t signal); + +/** + * @brief Atomically read the current value of a signal. + * + * @param[in] signal Signal. + * + * @return Value of the signal.
+*/ +hsa_signal_value_t HSA_API hsa_signal_load_scacquire( + hsa_signal_t signal); + +/** + * @copydoc hsa_signal_load_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_load_relaxed( + hsa_signal_t signal); + +/** + * @deprecated Renamed as ::hsa_signal_load_scacquire. + * + * @copydoc hsa_signal_load_scacquire +*/ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire( + hsa_signal_t signal); + +/** + * @brief Atomically set the value of a signal. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API hsa_signal_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_store_relaxed + */ +void HSA_API hsa_signal_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_store_screlease. + * + * @copydoc hsa_signal_store_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_store_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal without necessarily notifying the + * agents waiting on it. + * + * @details The agents waiting on @p signal may not wake up even when the new + * value satisfies their wait condition. If the application wants to update the + * signal and there is no need to notify any agent, invoking this function can + * be more efficient than calling the non-silent counterpart. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value.
+ */ +void HSA_API hsa_signal_silent_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_silent_store_relaxed + */ +void HSA_API hsa_signal_silent_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal and return its previous value. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value New value. + * + * @return Value of the signal prior to the exchange. + * + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl. + * + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacquire. + * + * @copydoc hsa_signal_exchange_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_screlease. 
+ * + * @copydoc hsa_signal_exchange_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal if the observed value is equal to + * the expected value. The observed value is returned regardless of whether the + * replacement was done. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue + * doorbell signal, the behavior is undefined. + * + * @param[in] expected Value to compare with. + * + * @param[in] value New value. + * + * @return Observed value of the signal. + * + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacq_screl. + * + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacquire. 
+ * + * @copydoc hsa_signal_cas_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_relaxed( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_screlease( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_screlease. + * + * @copydoc hsa_signal_cas_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @brief Atomically increment the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to add to the value of the signal. + * + */ +void HSA_API hsa_signal_add_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacq_screl. + * + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacquire. 
+ * + * @copydoc hsa_signal_add_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_add_screlease. + * + * @copydoc hsa_signal_add_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically decrement the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to subtract from the value of the signal. + * + */ +void HSA_API hsa_signal_subtract_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl. + * + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacquire. 
+ * + * @copydoc hsa_signal_subtract_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_screlease. + * + * @copydoc hsa_signal_subtract_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise AND operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to AND with the value of the signal. + * + */ +void HSA_API hsa_signal_and_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacq_screl. + * + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacquire. 
+ * + * @copydoc hsa_signal_and_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_and_screlease. + * + * @copydoc hsa_signal_and_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise OR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to OR with the value of the signal. + */ +void HSA_API hsa_signal_or_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_or_scacq_screl. + * + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_scacquire. 
+ * + * @copydoc hsa_signal_or_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_screlease. + * + * @copydoc hsa_signal_or_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise XOR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to XOR with the value of the signal. + * + */ +void HSA_API hsa_signal_xor_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacq_screl. + * + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacquire. 
+ * + * @copydoc hsa_signal_xor_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_screlease. + * + * @copydoc hsa_signal_xor_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Wait condition operator. + */ +typedef enum { + /** + * The two operands are equal. + */ + HSA_SIGNAL_CONDITION_EQ = 0, + /** + * The two operands are not equal. + */ + HSA_SIGNAL_CONDITION_NE = 1, + /** + * The first operand is less than the second operand. + */ + HSA_SIGNAL_CONDITION_LT = 2, + /** + * The first operand is greater than or equal to the second operand. + */ + HSA_SIGNAL_CONDITION_GTE = 3 +} hsa_signal_condition_t; + +/** + * @brief State of the application thread during a signal wait. + */ +typedef enum { + /** + * The application thread may be rescheduled while waiting on the signal. + */ + HSA_WAIT_STATE_BLOCKED = 0, + /** + * The application thread stays active while waiting on a signal. + */ + HSA_WAIT_STATE_ACTIVE = 1 +} hsa_wait_state_t; + + +/** + * @brief Wait until a signal value satisfies a specified condition, or a + * certain amount of time has elapsed. + * + * @details A wait operation can spuriously resume at any time sooner than the + * timeout (for example, due to system or other external factors) even when the + * condition has not been met. + * + * The function is guaranteed to return if the signal value satisfies the + * condition at some point in time during the wait, but the value returned to + * the application might not satisfy the condition. 
The application must ensure + * that signals are used in such a way that wait wakeup conditions are not + * invalidated before dependent threads have woken up. + * + * When the wait operation internally loads the value of the passed signal, it + * uses the memory order indicated in the function name. + * + * @param[in] signal Signal. + * + * @param[in] condition Condition used to compare the signal value with @p + * compare_value. + * + * @param[in] compare_value Value to compare with. + * + * @param[in] timeout_hint Maximum duration of the wait. Specified in the same + * unit as the system timestamp. The operation might block for a shorter or + * longer time even if the condition is not met. A value of UINT64_MAX indicates + * no maximum. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is ultimately decided by + * the HSA runtime and may not match the provided hint. A value of + * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal + * update by avoiding rescheduling overhead. + * + * @return Observed value of the signal, which might not satisfy the specified + * condition. + * +*/ +hsa_signal_value_t HSA_API hsa_signal_wait_scacquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_wait_relaxed( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @deprecated Renamed as ::hsa_signal_wait_scacquire.
+ * + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @brief Group of signals. + */ +typedef struct hsa_signal_group_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_signal_group_t; + +/** + * @brief Create a signal group. + * + * @param[in] num_signals Number of elements in @p signals. Must not be 0. + * + * @param[in] signals List of signals in the group. The list must not contain + * any repeated elements. Must not be NULL. + * + * @param[in] num_consumers Number of elements in @p consumers. Must not be 0. + * + * @param[in] consumers List of agents that might consume (wait on) the signal + * group. The list must not contain repeated elements, and must be a subset of + * the set of agents that are allowed to wait on all the signals in the + * group. If an agent not listed in @p consumers waits on the returned group, + * the behavior is undefined. The memory associated with @p consumers can be + * reused or freed after the function returns. Must not be NULL. + * + * @param[out] signal_group Pointer to newly created signal group. Must not be + * NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals + * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is + * NULL. 
+ */ +hsa_status_t HSA_API hsa_signal_group_create( + uint32_t num_signals, + const hsa_signal_t *signals, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_group_t *signal_group); + +/** + * @brief Destroy a signal group previously created by ::hsa_signal_group_create. + * + * @param[in] signal_group Signal group. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + */ +hsa_status_t HSA_API hsa_signal_group_destroy( + hsa_signal_group_t signal_group); + +/** + * @brief Wait until the value of at least one of the signals in a signal group + * satisfies its associated condition. + * + * @details The function is guaranteed to return if the value of at least one of + * the signals in the group satisfies its associated condition at some point in + * time during the wait, but the signal value returned to the application may no + * longer satisfy the condition. The application must ensure that signals in the + * group are used in such a way that wait wakeup conditions are not invalidated + * before dependent threads have woken up. + * + * When this operation internally loads the value of the passed signal, it uses + * the memory order indicated in the function name. + * + * @param[in] signal_group Signal group. + * + * @param[in] conditions List of conditions. Each condition, and the value at + * the same index in @p compare_values, is used to compare the value of the + * signal at that index in @p signal_group (the signal passed by the application + * to ::hsa_signal_group_create at that particular index). The size of @p + * conditions must not be smaller than the number of signals in @p signal_group; + * any extra elements are ignored. Must not be NULL. + * + * @param[in] compare_values List of comparison values.
The size of @p + * compare_values must not be smaller than the number of signals in @p + * signal_group; any extra elements are ignored. Must not be NULL. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is decided by the HSA runtime + * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may + * improve the latency of response to a signal update by avoiding rescheduling + * overhead. + * + * @param[out] signal Signal in the group that satisfied the associated + * condition. If several signals satisfied their condition, the function can + * return any of those signals. Must not be NULL. + * + * @param[out] value Observed value for @p signal, which might no longer satisfy + * the specified condition. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p + * compare_values is NULL, @p signal is NULL, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** + * @copydoc hsa_signal_group_wait_any_scacquire + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** @} */ + +/** \defgroup memory Memory + * @{ + */ + +/** + * @brief A memory region represents a block of virtual memory with certain + * properties. 
For example, the HSA runtime represents fine-grained memory in + * the global segment using a region. A region might be associated with more + * than one agent. + */ +typedef struct hsa_region_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_region_t; + +/** @} */ + + +/** \defgroup queue Queues + * @{ + */ + +/** + * @brief Queue type. Intended to be used for dynamic queue protocol + * determination. + */ +typedef enum { + /** + * Queue supports multiple producers. Use of multiproducer queue mechanics is + * required. + */ + HSA_QUEUE_TYPE_MULTI = 0, + /** + * Queue only supports a single producer. In some scenarios, the application + * may want to limit the submission of AQL packets to a single agent. Queues + * that support a single producer may be more efficient than queues supporting + * multiple producers. Use of multiproducer queue mechanics is not supported. + */ + HSA_QUEUE_TYPE_SINGLE = 1, + /** + * Queue supports multiple producers and cooperative dispatches. Cooperative + * dispatches are able to use GWS synchronization. Queues of this type may be + * limited in number. The runtime may return the same queue to serve multiple + * ::hsa_queue_create calls when this type is given. Callers must inspect the + * returned queue to discover queue size. Queues of this type are reference + * counted and require a matching number of ::hsa_queue_destroy calls to + * release. Use of multiproducer queue mechanics is required. See + * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this + * type. + */ + HSA_QUEUE_TYPE_COOPERATIVE = 2 +} hsa_queue_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_queue_type_t constants. + */ +typedef uint32_t hsa_queue_type32_t; + +/** + * @brief Queue features. + */ +typedef enum { + /** + * Queue supports kernel dispatch packets. 
+ */ + HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, + + /** + * Queue supports agent dispatch packets. + */ + HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 +} hsa_queue_feature_t; + +/** + * @brief User mode queue. + * + * @details The queue structure is read-only and allocated by the HSA runtime, + * but agents can directly modify the contents of the buffer pointed to by @a + * base_address, or use HSA runtime APIs to access the doorbell signal. + * + */ +typedef struct hsa_queue_s { + /** + * Queue type. + */ + hsa_queue_type32_t type; + + /** + * Queue features mask. This is a bit-field of ::hsa_queue_feature_t + * values. Applications should ignore any unknown set bits. + */ + uint32_t features; + +#ifdef HSA_LARGE_MODEL + void* base_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Starting address of the HSA runtime-allocated buffer used to store the AQL + * packets. Must be aligned to the size of an AQL packet. + */ + void* base_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; +#else + uint32_t reserved0; + void* base_address; +#endif + + /** + * Signal object used by the application to indicate the ID of a packet that + * is ready to be processed. The HSA runtime manages the doorbell signal. If + * the application tries to replace or destroy this signal, the behavior is + * undefined. + * + * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be + * updated in a monotonically increasing fashion. If @a type is + * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any + * value. + */ + hsa_signal_t doorbell_signal; + + /** + * Maximum number of packets the queue can hold. Must be a power of 2. + */ + uint32_t size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Queue identifier, which is unique over the lifetime of the application. + */ + uint64_t id; + +} hsa_queue_t; + +/** + * @brief Create a user mode queue.
+ * + * @details The HSA runtime creates the queue structure, the underlying packet + * buffer, the completion signal, and the write and read indexes. The initial + * value of the write and read indexes is 0. The type of every packet in the + * buffer is initialized to ::HSA_PACKET_TYPE_INVALID. + * + * The application should only rely on the error code returned to determine if + * the queue is valid. + * + * @param[in] agent Agent where to create the queue. + * + * @param[in] size Number of packets the queue is expected to + * hold. Must be a power of 2 between 1 and the value of + * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly + * created queue is the maximum of @p size and the value of + * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. + * + * @param[in] type Type of the queue, a bitwise OR of hsa_queue_type_t values. + * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, + * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE. + * + * @param[in] callback Callback invoked by the HSA runtime for every + * asynchronous event related to the newly created queue. May be NULL. The HSA + * runtime passes three arguments to the callback: a code identifying the event + * that triggered the invocation, a pointer to the queue where the event + * originated, and the application data. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @param[in] private_segment_size Hint indicating the maximum + * expected private segment usage per work-item, in bytes. There may + * be performance degradation if the application places a kernel + * dispatch packet in the queue and the corresponding private segment + * usage exceeds @p private_segment_size. If the application does not + * want to specify any particular value for this argument, @p + * private_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. 
+ * + * @param[in] group_segment_size Hint indicating the maximum expected + * group segment usage per work-group, in bytes. There may be + * performance degradation if the application places a kernel dispatch + * packet in the queue and the corresponding group segment usage + * exceeds @p group_segment_size. If the application does not want to + * specify any particular value for this argument, @p + * group_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not + * support queues of the given type. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, + * @p size is 0, @p type is an invalid queue type, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_queue_create( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue); + +/** + * @brief Create a queue for which the application or a kernel is responsible + * for processing the AQL packets. + * + * @details The application can use this function to create queues where AQL + * packets are not parsed by the packet processor associated with an agent, + * but rather by a unit of execution running on that agent (for example, a + * thread in the host application). 
+ * + * The application is responsible for ensuring that all the producers and + * consumers of the resulting queue can access the provided doorbell signal + * and memory region. The application is also responsible for ensuring that the + * unit of execution processing the queue packets supports the indicated + * features (AQL packet types). + * + * When the queue is created, the HSA runtime allocates the packet buffer using + * @p region, and the write and read indexes. The initial value of the write and + * read indexes is 0, and the type of every packet in the buffer is initialized + * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features, + * and @e doorbell_signal fields in the returned queue match the values passed + * by the application. + * + * @param[in] region Memory region that the HSA runtime should use to allocate + * the AQL packet buffer and any other queue metadata. + * + * @param[in] size Number of packets the queue is expected to hold. Must be a + * power of 2 greater than 0. + * + * @param[in] type Queue type. + * + * @param[in] features Supported queue features. This is a bit-field of + * ::hsa_queue_feature_t values. + * + * @param[in] doorbell_signal Doorbell signal that the HSA runtime must + * associate with the returned queue. The signal handle must not be 0. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. The application should not rely on the value + * returned for this argument but only in the status code to determine if the + * queue is valid. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p + * size is 0, @p type is an invalid queue type, the doorbell signal handle is + * 0, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_soft_queue_create( + hsa_region_t region, + uint32_t size, + hsa_queue_type32_t type, + uint32_t features, + hsa_signal_t doorbell_signal, + hsa_queue_t **queue); + +/** + * @brief Destroy a user mode queue. + * + * @details When a queue is destroyed, the state of the AQL packets that have + * not yet been fully processed (their completion phase has not finished) + * becomes undefined. It is the responsibility of the application to ensure that + * all pending queue operations are finished if their results are required. + * + * The resources allocated by the HSA runtime during queue creation (queue + * structure, ring buffer, doorbell signal) are released. The queue should not + * be accessed after being destroyed. + * + * @param[in] queue Pointer to a queue created using ::hsa_queue_create. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_destroy( + hsa_queue_t *queue); + +/** + * @brief Inactivate a queue. + * + * @details Inactivating the queue aborts any pending executions and prevents any + * new packets from being processed. Any more packets written to the queue once + * it is inactivated will be ignored by the packet processor. + * + * @param[in] queue Pointer to a queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_inactivate( + hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire. + * + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the read index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Read index of the queue pointed by @p queue. + */ +uint64_t HSA_API hsa_queue_load_read_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_read_index_relaxed( + const hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire. + * + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the write index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Write index of the queue pointed by @p queue. + */ +uint64_t HSA_API hsa_queue_load_write_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_write_index_relaxed( + const hsa_queue_t *queue); + +/** + * @brief Atomically set the write index of a queue. + * + * @details It is recommended that the application uses this function to update + * the write index when there is a single agent submitting work to the queue + * (the queue type is ::HSA_QUEUE_TYPE_SINGLE). + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the write index. + * + */ +void HSA_API hsa_queue_store_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_write_index_screlease. 
+ * + * @copydoc hsa_queue_store_write_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_write_index_relaxed + */ +void HSA_API hsa_queue_store_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl. + * + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @brief Atomically set the write index of a queue if the observed value is + * equal to the expected value. The application can inspect the returned value + * to determine if the replacement was done. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] expected Expected value. + * + * @param[in] value Value to assign to the write index if @p expected matches + * the observed write index. Must be greater than @p expected. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire. + * + * @copydoc hsa_queue_cas_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease. 
+ * + * @copydoc hsa_queue_cas_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_screlease( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl. + * + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically increment the write index of a queue by an offset. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to add to the write index. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_add_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire. + * + * @copydoc hsa_queue_add_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_screlease. 
+ * + * @copydoc hsa_queue_add_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically set the read index of a queue. + * + * @details Modifications of the read index are not allowed and result in + * undefined behavior if the queue is associated with an agent for which + * only the corresponding packet processor is permitted to update the read + * index. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the read index. + * + */ +void HSA_API hsa_queue_store_read_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_read_index_screlease. + * + * @copydoc hsa_queue_store_read_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_read_index_relaxed + */ +void HSA_API hsa_queue_store_read_index_screlease( + const hsa_queue_t *queue, + uint64_t value); +/** @} */ + + +/** \defgroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Packet type. + */ +typedef enum { + /** + * Vendor-specific packet. + */ + HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0, + /** + * The packet has been processed in the past, but has not been reassigned to + * the packet processor. A packet processor must not process a packet of this + * type. All queues support this packet type. + */ + HSA_PACKET_TYPE_INVALID = 1, + /** + * Packet used by agents for dispatching jobs to kernel agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). 
+ */ + HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_AND = 3, + /** + * Packet used by agents for dispatching jobs to agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). + */ + HSA_PACKET_TYPE_AGENT_DISPATCH = 4, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_OR = 5 +} hsa_packet_type_t; + +/** + * @brief Scope of the memory fence operation associated with a packet. + */ +typedef enum { + /** + * No scope (no fence is applied). The packet relies on external fences to + * ensure visibility of memory updates. + */ + HSA_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied with agent scope for the global segment. + */ + HSA_FENCE_SCOPE_AGENT = 1, + /** + * The fence is applied across both agent and system scope for the global + * segment. + */ + HSA_FENCE_SCOPE_SYSTEM = 2 +} hsa_fence_scope_t; + +/** + * @brief Sub-fields of the @a header field that is present in any AQL + * packet. The offset (with respect to the address of @a header) of a sub-field + * is identical to its enumeration constant. The width of each sub-field is + * determined by the corresponding value in ::hsa_packet_header_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Packet type. The value of this sub-field must be one of + * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the + * packet layout is vendor-specific. + */ + HSA_PACKET_HEADER_TYPE = 0, + /** + * Barrier bit. If the barrier bit is set, the processing of the current + * packet only launches when all preceding packets (within the same queue) are + * complete. 
+ */ + HSA_PACKET_HEADER_BARRIER = 8, + /** + * Acquire fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied before the packet enters the + * active phase. An acquire fence ensures that any subsequent global segment + * or image loads by any unit of execution that belongs to a dispatch that has + * not yet entered the active phase on any queue of the same kernel agent, + * sees any data previously released at the scopes specified by the acquire + * fence. The value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9, + /** + * Release fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied after kernel completion but + * before the packet is completed. A release fence makes any global segment or + * image data that was stored by any unit of execution that belonged to a + * dispatch that has completed the active phase on any queue of the same + * kernel agent visible in all the scopes specified by the release fence. The + * value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11 + } hsa_packet_header_t; + +/** + * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t. + */ + typedef enum { + HSA_PACKET_HEADER_WIDTH_TYPE = 8, + HSA_PACKET_HEADER_WIDTH_BARRIER = 1, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE.
+ */ + HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2 + } hsa_packet_header_width_t; + +/** + * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset + * (with respect to the address of @a setup) of a sub-field is identical to its + * enumeration constant. The width of each sub-field is determined by the + * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Number of dimensions of the grid. Valid values are 1, 2, or 3. + * + */ + HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 + } hsa_kernel_dispatch_packet_setup_t; + +/** + * @brief Width (in bits) of the sub-fields in + * ::hsa_kernel_dispatch_packet_setup_t. + */ + typedef enum { + HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 + } hsa_kernel_dispatch_packet_setup_width_t; + +/** + * @brief AQL kernel dispatch packet + */ +typedef struct hsa_kernel_dispatch_packet_s { + union { + struct { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Dispatch setup parameters. Used to configure kernel dispatch parameters + * such as the number of dimensions in the grid. The parameters are described + * by ::hsa_kernel_dispatch_packet_setup_t. + */ + uint16_t setup; + }; + uint32_t full_header; + }; + + /** + * X dimension of work-group, in work-items. Must be greater than 0. + */ + uint16_t workgroup_size_x; + + /** + * Y dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 dimension, the only valid value is 1. + */ + uint16_t workgroup_size_y; + + /** + * Z dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 or 2 dimensions, the only valid value is 1. + */ + uint16_t workgroup_size_z; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * X dimension of grid, in work-items. 
Must be greater than 0. Must + * not be smaller than @a workgroup_size_x. + */ + uint32_t grid_size_x; + + /** + * Y dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 dimension, the only valid value is 1. Must not be smaller than @a + * workgroup_size_y. + */ + uint32_t grid_size_y; + + /** + * Z dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a + * workgroup_size_z. + */ + uint32_t grid_size_z; + + /** + * Size in bytes of private memory allocation request (per work-item). + */ + uint32_t private_segment_size; + + /** + * Size in bytes of group memory allocation request (per work-group). Must not + * be less than the sum of the group memory used by the kernel (and the + * functions it calls directly or indirectly) and the dynamically allocated + * group segment variables. + */ + uint32_t group_segment_size; + + /** + * Opaque handle to a code object that includes an implementation-defined + * executable code for the kernel. + */ + uint64_t kernel_object; + +#ifdef HSA_LARGE_MODEL + void* kernarg_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Pointer to a buffer containing the kernel arguments. May be NULL. + * + * The buffer must be allocated using ::hsa_memory_allocate, and must not be + * modified once the kernel dispatch packet is enqueued until the dispatch has + * completed execution. + */ + void* kernarg_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* kernarg_address; +#endif + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_kernel_dispatch_packet_t; + +/** + * @brief Agent dispatch packet. 
+ */ +typedef struct hsa_agent_dispatch_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Application-defined function to be performed by the destination agent. + */ + uint16_t type; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + +#ifdef HSA_LARGE_MODEL + void* return_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Address where to store the function return values, if any. + */ + void* return_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* return_address; +#endif + + /** + * Function arguments. + */ + uint64_t arg[4]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_agent_dispatch_packet_t; + +/** + * @brief Barrier-AND packet. + */ +typedef struct hsa_barrier_and_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as satisfied + * dependencies. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_and_packet_t; + +/** + * @brief Barrier-OR packet. 
+ */ +typedef struct hsa_barrier_or_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as dependencies not + * satisfied. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_or_packet_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a region. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_REGION_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_REGION_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_REGION_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_REGION_SEGMENT_GROUP = 3, + /** + * Kernarg segment. Used to store kernel arguments. + */ + HSA_REGION_SEGMENT_KERNARG = 4 +} hsa_region_segment_t; + +/** + * @brief Global region flags. + */ +typedef enum { + /** + * The application can use memory in the region to store kernel arguments, and + * provide the values for the kernarg segment of a kernel dispatch. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set. 
+ */ + HSA_REGION_GLOBAL_FLAG_KERNARG = 1, + /** + * Updates to memory in this region are immediately visible to all the + * agents under the terms of the HSA memory model. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set. + */ + HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Updates to memory in this region can be performed by a single agent at + * a time. If a different agent in the system is allowed to access the + * region, the application must explicitly invoke ::hsa_memory_assign_agent + * in order to transfer ownership to that agent for a particular buffer. + */ + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** + * Updates to memory in this region have extended scope, where the device-scope atomics + * to this memory type act as system-scope with respect to all variables located in + * memory regions of this type. + * Note: On non-compliant systems, the application may still be responsible for performing + * device-specific actions necessary to achieve system-scope coherence. + */ + HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8 +} hsa_region_global_flag_t; + +/** + * @brief Attributes of a memory region. + */ + +#ifdef __cplusplus +typedef enum : int { +#else +typedef enum { +#endif + /** + * Segment where memory in the region can be used. The type of this + * attribute is ::hsa_region_segment_t. + */ + HSA_REGION_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of + * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t + * values. + */ + HSA_REGION_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this region, in bytes. The type of this attribute is size_t. + */ + HSA_REGION_INFO_SIZE = 2, + /** + * Maximum allocation size in this region, in bytes. Must not exceed the value + * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
+ * + * If the region is in the global or readonly segments, this is the maximum + * size that the application can pass to ::hsa_memory_allocate. + * + * If the region is in the group segment, this is the maximum size (per + * work-group) that can be requested for a given kernel dispatch. If the + * region is in the private segment, this is the maximum size (per work-item) + * that can be requested for a specific kernel dispatch, and must be at least + * 256 bytes. + */ + HSA_REGION_INFO_ALLOC_MAX_SIZE = 4, + /** + * Maximum size (per work-group) of private memory that can be requested for a + * specific kernel dispatch. Must be at least 65536 bytes. The type of this + * attribute is uint32_t. The value of this attribute is undefined if the + * region is not in the private segment. + */ + HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8, + /** + * Indicates whether memory in this region can be allocated using + * ::hsa_memory_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for regions in the group and private + * segments. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by ::hsa_memory_allocate in + * this region. The size of a buffer allocated in this region is a multiple of + * the value of this attribute. The value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type + * of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The + * value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be + * a power of 2. The type of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7 +} hsa_region_info_t; + +/** + * @brief Get the current value of an attribute of a region. + * + * @param[in] region A valid region. 
+ * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * region attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_region_get_info( + hsa_region_t region, + hsa_region_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory regions associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per region that is + * accessible from the agent. The HSA runtime passes two arguments to the + * callback, the region and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and ::hsa_agent_iterate_regions returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data); + +/** + * @brief Allocate a block of memory in a given region.
+ * + * @param[in] region Region where to allocate memory from. The region must have + * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE + * in @p region. + * + * @param[out] ptr Pointer to the location where to store the base address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation + * fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p region, or @p size is greater than the value of + * HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + */ +hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region, + size_t size, + void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_memory_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_memory_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_memory_free(void* ptr); + +/** + * @brief Copy a block of memory from the location pointed to by @p src to the + * memory block pointed to by @p dst. 
+ * + * @param[out] dst Buffer where the content is to be copied. If @p dst is in + * coarse-grained memory, the copied data is only visible to the agent currently + * assigned (::hsa_memory_assign_agent) to @p dst. + * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer. If the source buffer is + * in coarse-grained memory then it must be assigned to an agent, from which the + * data will be retrieved. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL. + */ +hsa_status_t HSA_API hsa_memory_copy( + void *dst, + const void *src, + size_t size); + +/** + * @brief Change the ownership of a global, coarse-grained buffer. + * + * @details The contents of a coarse-grained buffer are visible to an agent + * only after ownership has been explicitly transferred to that agent. Once the + * operation completes, the previous owner can no longer access the data in the + * buffer. + * + * An implementation of the HSA runtime is allowed, but not required, to change + * the physical location of the buffer when ownership is transferred to a + * different agent. In general the application must not assume this + * behavior. The virtual location (address) of the passed buffer is never + * modified. + * + * @param[in] ptr Base address of a global buffer. The pointer must match an + * address previously returned by ::hsa_memory_allocate.
The size of the buffer + * affected by the ownership change is identical to the size of that previous + * allocation. If @p ptr points to a fine-grained global buffer, no operation is + * performed and the function returns success. If @p ptr does not point to + * global memory, the behavior is undefined. + * + * @param[in] agent Agent that becomes the owner of the buffer. The + * application is responsible for ensuring that @p agent has access to the + * region that contains the buffer. It is allowed to change ownership to an + * agent that is already the owner of the buffer, with the same or different + * access permissions. + * + * @param[in] access Access permissions requested for the new owner. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is + * not a valid access value. + */ +hsa_status_t HSA_API hsa_memory_assign_agent( + void *ptr, + hsa_agent_t agent, + hsa_access_permission_t access); + +/** + * + * @brief Register a global, fine-grained buffer. + * + * @details Registering a buffer serves as an indication to the HSA runtime that + * the memory might be accessed from a kernel agent other than the + * host. Registration is a performance hint that allows the HSA runtime + * implementation to know which buffers will be accessed by some of the kernel + * agents ahead of time. + * + * Registration is only recommended for buffers in the global segment that have + * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS + * allocator instead. Registering an OS-allocated buffer in the base profile is + * equivalent to a no-op. 
+ * + * Registrations should not overlap. + * + * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is + * passed, no operation is performed. If the buffer has been allocated using + * ::hsa_memory_allocate, or has already been registered, no operation is + * performed. + * + * @param[in] size Requested registration size in bytes. A size of 0 is + * only allowed if @p ptr is NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr + * is not NULL. + */ +hsa_status_t HSA_API hsa_memory_register( + void *ptr, + size_t size); + +/** + * + * @brief Deregister memory previously registered using ::hsa_memory_register. + * + * @details If the memory interval being deregistered does not match a previous + * registration (start and end addresses), the behavior is undefined. + * + * @param[in] ptr A pointer to the base of the buffer to be deregistered. If + * a NULL pointer is passed, no operation is performed. + * + * @param[in] size Size of the buffer to be deregistered. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_memory_deregister( + void *ptr, + size_t size); + +/** @} */ + + +/** \defgroup instruction-set-architecture Instruction Set Architecture. + * @{ + */ + +/** + * @brief Instruction set architecture. + */ +typedef struct hsa_isa_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. 
+ */ + uint64_t handle; +} hsa_isa_t; + +/** + * @brief Retrieve a reference to an instruction set architecture handle out of + * a symbolic name. + * + * @param[in] name Vendor-specific name associated with a particular + * instruction set architecture. @p name must start with the vendor name and a + * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be + * a NUL-terminated string. + * + * @param[out] isa Memory location where the HSA runtime stores the ISA handle + * corresponding to the given name. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not + * correspond to any instruction set architecture. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_from_name( + const char *name, + hsa_isa_t *isa); + +/** + * @brief Iterate over the instruction sets supported by the given agent, and + * invoke an application-defined callback on every iteration. The iterator is + * deterministic: if an agent supports several instruction set architectures, + * they are traversed in the same order in every invocation of this function. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per instruction set + * architecture. The HSA runtime passes two arguments to the callback: the + * ISA and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that status value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL.
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_isas( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_isa_t isa, void *data), + void *data); + +/** + * @brief Instruction set architecture attributes. + */ +typedef enum { + /** + * The length of the ISA name in bytes, not including the NUL terminator. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is character array + * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute. + */ + HSA_ISA_INFO_NAME = 1, + /** + * @deprecated + * + * Number of call conventions supported by the instruction set architecture. + * Must be greater than zero. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2, + /** + * @deprecated + * + * Number of work-items in a wavefront for a given call convention. Must be a + * power of 2 in the range [1,256]. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3, + /** + * @deprecated + * + * Number of wavefronts per compute unit for a given call convention. In + * practice, other factors (for example, the amount of group memory used by a + * work-group) may further limit the number of wavefronts per compute + * unit. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4, + /** + * Machine models supported by the instruction set architecture. The type of + * this attribute is a bool[2]. If the ISA supports the small machine model, + * the element at index ::HSA_MACHINE_MODEL_SMALL is true. 
If the ISA supports + * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true. + */ + HSA_ISA_INFO_MACHINE_MODELS = 5, + /** + * Profiles supported by the instruction set architecture. The type of this + * attribute is a bool[2]. If the ISA supports the base profile, the element + * at index ::HSA_PROFILE_BASE is true. If the ISA supports the full profile, + * the element at index ::HSA_PROFILE_FULL is true. + */ + HSA_ISA_INFO_PROFILES = 6, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture. The type of this attribute is a bool[3]. The value at a given + * index is true if the corresponding rounding mode in + * ::hsa_default_float_rounding_mode_t is supported. At least one default mode + * has to be supported. + * + * If the default mode is supported, then + * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that + * both the zero and the near rounding modes are supported. + */ + HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture in the Base profile. The type of this attribute is a + * bool[3]. The value at a given index is true if the corresponding rounding + * mode in ::hsa_default_float_rounding_mode_t is supported. The value at + * index ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false. At least one + * of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or + * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true. + */ + HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8, + /** + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the instruction set architecture. The type of this + * attribute is bool. + */ + HSA_ISA_INFO_FAST_F16_OPERATION = 9, + /** + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0.
No maximum can exceed the value of + * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is + * uint16_t[3]. + */ + HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12, + /** + * Maximum total number of work-items in a work-group. The type + * of this attribute is uint32_t. + */ + HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13, + /** + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_ISA_INFO_GRID_MAX_DIM = 14, + /** + * Maximum total number of work-items in a grid. The type of this + * attribute is uint64_t. + */ + HSA_ISA_INFO_GRID_MAX_SIZE = 16, + /** + * Maximum number of fbarriers per work-group. Must be at least 32. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17 +} hsa_isa_info_t; + +/** + * @deprecated The concept of call convention has been deprecated. If the + * application wants to query the value of an attribute for a given instruction + * set architecture, use ::hsa_isa_get_info_alt instead. If the application + * wants to query an attribute that is specific to a given combination of ISA + * and wavefront, use ::hsa_wavefront_get_info. + * + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[in] index Call convention index. Used only for call convention + * attributes, otherwise ignored. Must have a value between 0 (inclusive) and + * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not + * inclusive) in @p isa. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. 
If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info( + hsa_isa_t isa, + hsa_isa_info_t attribute, + uint32_t index, + void *value); + +/** + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_get_info_alt( + hsa_isa_t isa, + hsa_isa_info_t attribute, + void *value); + +/** + * @brief Retrieve the exception policy support for a given combination of + * instruction set architecture and profile. + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] profile Profile. 
+ * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_exception_policies( + hsa_isa_t isa, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Floating-point types. + */ +typedef enum { + /** + * 16-bit floating-point type. + */ + HSA_FP_TYPE_16 = 1, + /** + * 32-bit floating-point type. + */ + HSA_FP_TYPE_32 = 2, + /** + * 64-bit floating-point type. + */ + HSA_FP_TYPE_64 = 4 +} hsa_fp_type_t; + +/** + * @brief Flush to zero modes. + */ +typedef enum { + /** + * Flush to zero. + */ + HSA_FLUSH_MODE_FTZ = 1, + /** + * Do not flush to zero. + */ + HSA_FLUSH_MODE_NON_FTZ = 2 +} hsa_flush_mode_t; + +/** + * @brief Round methods. + */ +typedef enum { + /** + * Single round method. + */ + HSA_ROUND_METHOD_SINGLE = 1, + /** + * Double round method. + */ + HSA_ROUND_METHOD_DOUBLE = 2 +} hsa_round_method_t; + +/** + * @brief Retrieve the round method (single or double) used to implement the + * floating-point multiply add instruction (mad) for a given combination of + * instruction set architecture, floating-point type, and flush to zero + * modifier. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] fp_type Floating-point type. + * + * @param[in] flush_mode Flush to zero modifier. + * + * @param[out] round_method Pointer to a memory location where the HSA + * runtime stores the round method used by the implementation. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid + * floating-point type, or @p flush_mode is not a valid flush to zero modifier, + * or @p round_method is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_round_method( + hsa_isa_t isa, + hsa_fp_type_t fp_type, + hsa_flush_mode_t flush_mode, + hsa_round_method_t *round_method); + +/** + * @brief Wavefront handle + */ +typedef struct hsa_wavefront_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_wavefront_t; + +/** + * @brief Wavefront attributes. + */ +typedef enum { + /** + * Number of work-items in the wavefront. Must be a power of 2 in the range + * [1,256]. The type of this attribute is uint32_t. + */ + HSA_WAVEFRONT_INFO_SIZE = 0 +} hsa_wavefront_info_t; + +/** + * @brief Get the current value of a wavefront attribute. + * + * @param[in] wavefront A wavefront. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * wavefront attribute, or @p value is NULL. 
+ */ +hsa_status_t HSA_API hsa_wavefront_get_info( + hsa_wavefront_t wavefront, + hsa_wavefront_info_t attribute, + void *value); + +/** + * @brief Iterate over the different wavefronts supported by an instruction set + * architecture, and invoke an application-defined callback on every iteration. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] callback Callback to be invoked once per wavefront that is + * supported by the agent. The HSA runtime passes two arguments to the callback: + * the wavefront handle and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_isa_iterate_wavefronts( + hsa_isa_t isa, + hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data), + void *data); + +/** + * @deprecated Use ::hsa_agent_iterate_isas to query which instruction set + * architectures are supported by a given agent. + * + * @brief Check if the instruction set architecture of a code object can be + * executed on an agent associated with another architecture. + * + * @param[in] code_object_isa Instruction set architecture associated with a + * code object. + * + * @param[in] agent_isa Instruction set architecture associated with an agent. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check.
If the two architectures are compatible, the result + * is true; if they are incompatible, the result is false. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa are + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible( + hsa_isa_t code_object_isa, + hsa_isa_t agent_isa, + bool *result); + +/** @} */ + + +/** \defgroup executable Executable + * @{ + */ + +/** + * @brief Code object reader handle. A code object reader is used to + * load a code object from file (when created using + * ::hsa_code_object_reader_create_from_file), or from memory (if created using + * ::hsa_code_object_reader_create_from_memory). + */ +typedef struct hsa_code_object_reader_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_object_reader_t; + +/** + * @brief Create a code object reader to operate on a file. + * + * @param[in] file File descriptor. The file must have been opened by the + * application with at least read permissions prior to calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_file( + hsa_file_t file, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Create a code object reader to operate on memory. + * + * @param[in] code_object Memory buffer that contains a vendor-specific code + * object. The buffer is owned and managed by the application; the lifetime of + * the buffer must exceed that of any associated code object reader. + * + * @param[in] size Size of the buffer pointed to by @p code_object. Must not be + * 0. + * + * @param[out] code_object_reader Memory location to store the newly created code + * object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size + * is zero, or @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_memory( + const void *code_object, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Destroy a code object reader. + * + * @details The code object reader handle becomes invalid after completion of + * this function. Any file or memory used to create the code object reader is not + * closed, removed, or deallocated by this function. + * + * @param[in] code_object_reader Code object reader to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + */ +hsa_status_t HSA_API hsa_code_object_reader_destroy( + hsa_code_object_reader_t code_object_reader); + +/** + * @brief Struct containing an opaque handle to an executable, which contains + * ISA for finalized kernels and indirect functions together with the allocated + * global or readonly segment variables they reference. + */ +typedef struct hsa_executable_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_t; + +/** + * @brief Executable state. + */ +typedef enum { + /** + * Executable state, which allows the user to load code objects and define + * external variables. Variable addresses, kernel code handles, and + * indirect function code handles are not available in query operations until + * the executable is frozen (zero always returned). + */ + HSA_EXECUTABLE_STATE_UNFROZEN = 0, + /** + * Executable state, which allows the user to query variable addresses, + * kernel code handles, and indirect function code handles using query + * operations. Loading new code objects, as well as defining external + * variables, is not allowed in this state. + */ + HSA_EXECUTABLE_STATE_FROZEN = 1 +} hsa_executable_state_t; + +/** + * @deprecated Use ::hsa_executable_create_alt instead, which allows the + * application to specify the default floating-point rounding mode of the + * executable and assumes an unfrozen initial state. + * + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] executable_state Executable state. If the state is + * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no + * code objects can be loaded, and no variables can be defined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. 
A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores the newly + * created executable handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create( + hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] default_float_rounding_mode Default floating-point rounding mode + * used in the executable. Allowed rounding modes are near and zero (default is + * not allowed). + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores newly + * created executable handle. The initial state of the executable is + * ::HSA_EXECUTABLE_STATE_UNFROZEN. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API hsa_executable_create_alt( + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Destroy an executable. + * + * @details An executable handle becomes invalid after the executable has been + * destroyed. Code object handles that were loaded into this executable are + * still valid after the executable has been destroyed, and can be used as + * intended. Resources allocated outside and associated with this executable + * (such as external global or readonly variables) can be released after the + * executable has been destroyed. + * + * Executable should not be destroyed while kernels are in flight. + * + * @param[in] executable Executable. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + */ +hsa_status_t HSA_API hsa_executable_destroy( + hsa_executable_t executable); + +/** + * @brief Loaded code object handle. + */ +typedef struct hsa_loaded_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_loaded_code_object_t; + +/** + * @brief Load a program code object into an executable. + * + * @details A program code object contains information about resources that are + * accessible by all kernel agents that run the executable, and can be loaded + * at most once into an executable. 
+ * + * If the program code object uses extensions, the implementation must support + * them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] code_object_reader A code object reader that holds the program + * code object to load. If a code object reader is destroyed before all the + * associated executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is + * not compatible with the executable or the implementation (for example, the + * code object uses an extension that is not supported by the implementation). + */ +hsa_status_t HSA_API hsa_executable_load_program_code_object( + hsa_executable_t executable, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Load an agent code object into an executable. 
+ * + * @details The agent code object contains all defined agent + * allocation variables, functions, indirect functions, and kernels in a given + * program for a given instruction set architecture. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * The default floating-point rounding mode of the code object associated with + * @p code_object_reader must match that of the executable + * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which + * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used). + * If the agent code object uses extensions, the implementation and the agent + * must support them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent to load code object for. A code object can be loaded + * into an executable at most once for a given agent. The instruction set + * architecture of the code object must be supported by the agent. + * + * @param[in] code_object_reader A code object reader that holds the code object + * to load. If a code object reader is destroyed before all the associated + * executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p + * code_object_reader is not compatible with the agent (for example, the agent + * does not support the instruction set architecture of the code object), the + * executable (for example, there is a default floating-point mode mismatch + * between the two), or the implementation. + */ +hsa_status_t HSA_API hsa_executable_load_agent_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Freeze the executable. + * + * @details No modifications to executable can be made after freezing: no code + * objects can be loaded to the executable, and no external variables can be + * defined. Freezing the executable does not prevent querying the executable's + * attributes. The application must define all the external variables in an + * executable before freezing it. + * + * @param[in] executable Executable. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are + * undefined in the executable. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen. + */ +hsa_status_t HSA_API hsa_executable_freeze( + hsa_executable_t executable, + const char *options); + +/** + * @brief Executable attributes. + */ +typedef enum { + /** + * Profile this executable is created for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_EXECUTABLE_INFO_PROFILE = 1, + /** + * Executable state. The type of this attribute is ::hsa_executable_state_t. + */ + HSA_EXECUTABLE_INFO_STATE = 2, + /** + * Default floating-point rounding mode specified when executable was created. + * The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3 +} hsa_executable_info_t; + +/** + * @brief Get the current value of an attribute for a given executable. + * + * @param[in] executable Executable. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable attribute, or @p value is NULL. 
+ */ +hsa_status_t HSA_API hsa_executable_get_info( + hsa_executable_t executable, + hsa_executable_info_t attribute, + void *value); + +/** + * @brief Define an external global variable with program allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with program allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * be in global memory and can be read and written by any agent in the + * system. The application cannot deallocate the buffer pointed by @p address + * before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_global_variable_define( + hsa_executable_t executable, + const char *variable_name, + void *address); + +/** + * @brief Define an external global variable with agent allocation. 
+ * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with agent allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a global region + * that is only visible to @p agent. The application cannot deallocate the + * buffer pointed by @p address before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_agent_global_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Define an external readonly variable. 
+ * + * @details This function allows the application to provide the definition + * of a variable in the readonly segment memory. The variable must be defined + * before loading a code object into an executable. In addition, code objects + * loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a readonly + * region associated with @p agent. The application cannot deallocate the buffer + * pointed by @p address before @p executable is destroyed. + * + * @note The buffer pointed by @p address is owned by the application, + * and the application cannot deallocate it before @p executable is + * destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE Executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */ +hsa_status_t HSA_API hsa_executable_readonly_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). Invoking this function is equivalent to invoking + * ::hsa_executable_validate_alt with no options. + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate( + hsa_executable_t executable, + uint32_t *result); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL.
+ * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate_alt( + hsa_executable_t executable, + const char *options, + uint32_t *result); + +/** + * @brief Executable symbol handle. + * + * The lifetime of an executable object symbol matches that of the executable + * associated with it. An operation on a symbol whose associated executable has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_executable_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_symbol_t; + +/** + * @deprecated Use ::hsa_executable_get_symbol_by_name instead. + * + * @brief Get the symbol handle for a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[in] agent Agent associated with the symbol. If the symbol is + * independent of any agent (for example, a variable with program + * allocation), this argument is ignored. + * + * @param[in] call_convention Call convention associated with the symbol. If the + * symbol does not correspond to an indirect function, this argument is ignored. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol( + hsa_executable_t executable, + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention, + hsa_executable_symbol_t *symbol); + +/** + * @brief Retrieve the symbol handle corresponding to a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] symbol_name Symbol name. Must be a NUL-terminated character + * array. The Programmer's Reference Manual describes the standard name mangling + * scheme. + * + * @param[in] agent Pointer to the agent for which the symbol with the given + * name is defined. If the symbol corresponding to the given name has program + * allocation, @p agent must be NULL. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p + * symbol is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_symbol_by_name( + hsa_executable_t executable, + const char *symbol_name, + const hsa_agent_t *agent, + hsa_executable_symbol_t *symbol); + +/** + * @brief Symbol type. + */ +typedef enum { + /** + * Variable.
+ */ + HSA_SYMBOL_KIND_VARIABLE = 0, + /** + * Kernel. + */ + HSA_SYMBOL_KIND_KERNEL = 1, + /** + * Indirect function. + */ + HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2 +} hsa_symbol_kind_t; + +/** + * @brief Linkage type of a symbol. + */ +typedef enum { + /** + * Module linkage. + */ + HSA_SYMBOL_LINKAGE_MODULE = 0, + /** + * Program linkage. + */ + HSA_SYMBOL_LINKAGE_PROGRAM = 1 +} hsa_symbol_linkage_t; + +/** + * @brief Allocation type of a variable. + */ +typedef enum { + /** + * Agent allocation. + */ + HSA_VARIABLE_ALLOCATION_AGENT = 0, + /** + * Program allocation. + */ + HSA_VARIABLE_ALLOCATION_PROGRAM = 1 +} hsa_variable_allocation_t; + +/** + * @brief Memory segment associated with a variable. + */ +typedef enum { + /** + * Global memory segment. + */ + HSA_VARIABLE_SEGMENT_GLOBAL = 0, + /** + * Readonly memory segment. + */ + HSA_VARIABLE_SEGMENT_READONLY = 1 +} hsa_variable_segment_t; + +/** + * @brief Executable symbol attributes. + */ +typedef enum { + /** + * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, + /** + * @deprecated + * + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * @deprecated + * + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. 
The type of this attribute + * is character array with the length equal to the value of + * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * @deprecated + * + * Agent associated with this symbol. If the symbol is a variable, the + * value of this attribute is only defined if + * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is + * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20, + /** + * The address of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint64_t. + * + * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is + * returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * @deprecated + * + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * @deprecated + * + * The segment kind of the variable. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * @deprecated + * + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * @deprecated + * + * Size of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint32_t. + * + * A value of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * @deprecated + * + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Kernel object handle, used in the kernel dispatch packet. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint64_t. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. 
+ * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is + * true, the kernel may use more private memory than the reported value, and + * the application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * @deprecated + * + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Indirect function object handle. The value of this attribute is undefined + * if the symbol is not an indirect function, or the associated agent does + * not support the Full Profile. The type of this attribute depends on the + * machine model: the type is uint32_t for small machine model, and uint64_t + * for large model. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23, + /** + * @deprecated + * + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function, or the associated + * agent does not support the Full Profile. The type of this attribute is + * uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_executable_symbol_info_t; + +/** + * @brief Get the current value of an attribute for a given executable symbol. + * + * @param[in] executable_symbol Executable symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_symbol_get_info( + hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in an executable, and invoke an + * application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data.
If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the kernels, indirect functions, and agent allocation + * variables in an executable for a given agent, and invoke an application- + * defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes four arguments to the callback: the executable, the agent, + * a symbol, and the application data. If @p callback returns a status other + * than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_agent_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */ +hsa_status_t HSA_API hsa_executable_iterate_agent_symbols( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the program allocation variables in an executable, and + * invoke an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_program_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_executable_iterate_program_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** @} */ + + +/** \defgroup code-object Code Objects (deprecated). + * @{ + */ + +/** + * @deprecated + * + * @brief Struct containing an opaque handle to a code object, which contains + * ISA for finalized kernels and indirect functions together with information + * about the global or readonly segment variables they reference. + */ +typedef struct hsa_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal.
+ */ + uint64_t handle; +} hsa_code_object_t; + +/** + * @deprecated + * + * @brief Application data handle that is passed to the serialization + * and deserialization functions. + */ +typedef struct hsa_callback_data_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_callback_data_t; + +/** + * @deprecated + * + * @brief Serialize a code object. Can be used for offline finalization, + * install-time finalization, disk code caching, etc. + * + * @param[in] code_object Code object. + * + * @param[in] alloc_callback Callback function for memory allocation. Must not + * be NULL. The HSA runtime passes three arguments to the callback: the + * allocation size, the application data, and a pointer to a memory location + * where the application stores the allocation result. The HSA runtime invokes + * @p alloc_callback once to allocate a buffer that contains the serialized + * version of @p code_object. If the callback returns a status code other than + * ::HSA_STATUS_SUCCESS, this function returns the same code. + * + * @param[in] callback_data Application data that is passed to @p + * alloc_callback. May be NULL. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] serialized_code_object Memory location where the HSA runtime + * stores a pointer to the serialized code object. Must not be NULL. + * + * @param[out] serialized_code_object_size Memory location where the HSA runtime + * stores the size (in bytes) of @p serialized_code_object. The returned value + * matches the allocation size passed by the HSA runtime to @p + * alloc_callback. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p + * serialized_code_object, or @p serialized_code_object_size are NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)(size_t size, + hsa_callback_data_t data, + void **address), + hsa_callback_data_t callback_data, + const char *options, + void **serialized_code_object, + size_t *serialized_code_object_size); + +/** + * @deprecated + * + * @brief Deserialize a code object. + * + * @param[in] serialized_code_object A serialized code object. Must not be NULL. + * + * @param[in] serialized_code_object_size The size (in bytes) of @p + * serialized_code_object. Must not be 0. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] code_object Memory location where the HSA runtime stores the + * deserialized code object. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p + * code_object are NULL, or @p serialized_code_object_size is 0.
+ */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize( + void *serialized_code_object, + size_t serialized_code_object_size, + const char *options, + hsa_code_object_t *code_object); + +/** + * @deprecated + * + * @brief Destroy a code object. + * + * @details The lifetime of a code object must exceed that of any executable + * where it has been loaded. If an executable that loaded @p code_object has not + * been destroyed, the behavior is undefined. + * + * @param[in] code_object Code object. The handle becomes invalid after it has + * been destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy( + hsa_code_object_t code_object); + +/** + * @deprecated + * + * @brief Code object type. + */ +typedef enum { + /** + * Produces code object that contains ISA for all kernels and indirect + * functions in HSA source. + */ + HSA_CODE_OBJECT_TYPE_PROGRAM = 0 +} hsa_code_object_type_t; + +/** + * @deprecated + * + * @brief Code object attributes. + */ +typedef enum { + /** + * The version of the code object. The type of this attribute is a + * NUL-terminated char[64]. The name must be at most 63 characters long (not + * including the NUL terminator) and all array elements not used for the name + * must be NUL. + */ + HSA_CODE_OBJECT_INFO_VERSION = 0, + /** + * Type of code object. The type of this attribute is + * ::hsa_code_object_type_t. + */ + HSA_CODE_OBJECT_INFO_TYPE = 1, + /** + * Instruction set architecture this code object is produced for. The type of + * this attribute is ::hsa_isa_t. + */ + HSA_CODE_OBJECT_INFO_ISA = 2, + /** + * Machine model this code object is produced for. The type of this attribute + * is ::hsa_machine_model_t. 
+ */ + HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3, + /** + * Profile this code object is produced for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_CODE_OBJECT_INFO_PROFILE = 4, + /** + * Default floating-point rounding mode used when the code object is + * produced. The type of this attribute is + * ::hsa_default_float_rounding_mode_t. + */ + HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5 +} hsa_code_object_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code object. + * + * @param[in] code_object Code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code object attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info( + hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Load code object into the executable. + * + * @details Every global or readonly variable that is external must be defined + * before loading the code object. An internal global or readonly variable is + * allocated once the code object, that is being loaded, references this + * variable and this variable is not allocated. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * @param[in] executable Executable. 
+ * + * @param[in] agent Agent to load code object for. The agent must support the + * default floating-point rounding mode used by @p code_object. + * + * @param[in] code_object Code object to load. The lifetime of the code object + * must exceed that of the executable: if @p code_object is destroyed before @p + * executable, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible + * with @p code_object (for example, @p agent does not support the default + * floating-point rounding mode specified by @p code_object), or @p code_object + * is not compatible with @p executable (for example, @p code_object and @p + * executable have different machine models or profiles). + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options); + +/** + * @deprecated + * + * @brief Code object symbol handle.
+ * + * The lifetime of a code object symbol matches that of the code object + * associated with it. An operation on a symbol whose associated code object has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_code_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_symbol_t; + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol( + hsa_code_object_t code_object, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name( + hsa_code_object_t code_object, + const char *module_name, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Code object symbol attributes. + */ +typedef enum { + /** + * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_CODE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_CODE_SYMBOL_INFO_NAME = 2, + /** + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. The type of this attribute + * is character array with the length equal to the value of + * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_CODE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * The allocation kind of the variable. 
The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * The segment kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * Size of the variable. The value of this attribute is undefined if the + * symbol is not a variable. The type of this attribute is uint32_t. + * + * A size of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. 
+ */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true, + * the kernel may use more private memory than the reported value, and the + * application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function. The type of this + * attribute is uint32_t. 
+ */ + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16, + /** + * Wavefront size used by the kernel. The value of this attribute is either + * 32 or 64. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 +} hsa_code_symbol_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code symbol. + * + * @param[in] code_symbol Code symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info( + hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in a code object, and invoke an + * application-defined callback on every iteration. + * + * @param[in] code_object Code object. + * + * @param[in] callback Callback to be invoked once per code object symbol. The + * HSA runtime passes three arguments to the callback: the code object, a + * symbol, and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_code_object_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)(hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void *data), + void *data); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h new file mode 100644 index 0000000000..22847a8a44 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_amd_tool.h @@ -0,0 +1,97 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef HSA_RUNTIME_AMD_TOOL_EVENTS_H_ +#define HSA_RUNTIME_AMD_TOOL_EVENTS_H_ + +// Insert license header + +#include <stddef.h> +#include <stdint.h> +#include "hsa.h" + + +typedef enum { + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_NONE = 0, + HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE = + (1 << 0), // This scratch allocation is only valid for 1 dispatch.
+ HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT = + (1 << 1), // Used alternate scratch instead of main scratch +} hsa_amd_event_scratch_alloc_flag_t; + +typedef enum { + HSA_AMD_TOOL_EVENT_MIN = 0, + + // Scratch memory tracking + HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START, + HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END, + HSA_AMD_TOOL_EVENT_SCRATCH_FREE_START, + HSA_AMD_TOOL_EVENT_SCRATCH_FREE_END, + HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_START, + HSA_AMD_TOOL_EVENT_SCRATCH_ASYNC_RECLAIM_END, + + // Add new events above ^ + HSA_AMD_TOOL_EVENT_MAX +} hsa_amd_tool_event_kind_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; +} hsa_amd_tool_event_none_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; + uint64_t dispatch_id; // Dispatch ID of the AQL packet that needs more scratch memory +} hsa_amd_event_scratch_alloc_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; + uint64_t dispatch_id; // Dispatch ID of the AQL packet that needs more scratch memory + size_t size; // Amount of scratch allocated - in bytes + size_t num_slots; // limit of number of waves +} hsa_amd_event_scratch_alloc_end_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_free_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_free_end_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_async_reclaim_start_t; + +typedef struct { + hsa_amd_tool_event_kind_t kind; + const hsa_queue_t* queue; + hsa_amd_event_scratch_alloc_flag_t flags; +} hsa_amd_event_scratch_async_reclaim_end_t; + +typedef union { + const hsa_amd_tool_event_none_t* none; + 
const hsa_amd_event_scratch_alloc_start_t* scratch_alloc_start; + const hsa_amd_event_scratch_alloc_end_t* scratch_alloc_end; + const hsa_amd_event_scratch_free_start_t* scratch_free_start; + const hsa_amd_event_scratch_free_end_t* scratch_free_end; + const hsa_amd_event_scratch_async_reclaim_start_t* scratch_async_reclaim_start; + const hsa_amd_event_scratch_async_reclaim_end_t* scratch_async_reclaim_end; +} hsa_amd_tool_event_t; + +typedef hsa_status_t (*hsa_amd_tool_event)(hsa_amd_tool_event_t); + + +#endif \ No newline at end of file diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h new file mode 100644 index 0000000000..cc33320269 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace.h @@ -0,0 +1,587 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_H + +#include "hsa.h" +#include "hsa_api_trace_version.h" +#ifdef AMD_INTERNAL_BUILD +#include "hsa_ext_image.h" +#include "hsa_ext_amd.h" +#include "hsa_ext_finalize.h" +#include "hsa_amd_tool.h" +#include "hsa_ven_amd_pc_sampling.h" +#else +#include "inc/hsa_ext_image.h" +#include "inc/hsa_ext_amd.h" +#include "inc/hsa_ext_finalize.h" +#include "inc/hsa_amd_tool.h" +#include "inc/hsa_ven_amd_pc_sampling.h" +#endif + +#include <string.h> +#include <assert.h> +#include <stddef.h> + +// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h + +// Min function used to copy Api Tables +static inline uint32_t Min(const uint32_t a, const uint32_t b) { + return (a > b) ? b : a; +} + +// Declarations of APIs intended for use only by tools.
+ +// An AQL packet that can be put in an intercept queue to cause a callback to +// be invoked when the packet is about to be submitted to the underlying +// hardware queue. These packets are not copied to the underlying hardware +// queue. These packets should come immediately before the regular AQL packet +// they relate to. This implies that packet rewriters should always keep these +// packets adjacent to the regular AQL packet that follows them. +const uint32_t AMD_AQL_FORMAT_INTERCEPT_MARKER = 0xFE; + +struct amd_aql_intercept_marker_s; + +// When an intercept queue is processing rewritten packets to put them on the +// underlying hardware queue, if it encounters a +// AMD_AQL_FORMAT_INTERCEPT_MARKER vendor AQL packet it will call the following +// handler. packet points to the packet, queue is the underlying hardware +// queue, and packet_id is the packet id of the next packet to be put on the +// underlying hardware queue. The intercept queue does not put these packets +// onto the underlying hardware queue. +typedef void (*amd_intercept_marker_handler)(const struct amd_aql_intercept_marker_s* packet, + hsa_queue_t* queue, uint64_t packet_id); +// An AQL vendor packet used by the intercept queue to mark the following +// packet. The callback will be invoked to allow a tool to know where in the +// underlying hardware queue the following packet will be placed. user_data can +// be used to hold any data useful to the tool. +typedef struct amd_aql_intercept_marker_s { + uint16_t header; // Must have a packet type of HSA_PACKET_TYPE_VENDOR_SPECIFIC. + uint8_t format; // Must be AMD_AQL_FORMAT_INTERCEPT_MARKER. + uint8_t reserved[5]; // Must be 0. +#ifdef HSA_LARGE_MODEL + amd_intercept_marker_handler callback; +#elif defined HSA_LITTLE_ENDIAN + amd_intercept_marker_handler callback; + uint32_t reserved1; // Must be 0. +#else + uint32_t reserved1; // Must be 0. 
+ amd_intercept_marker_handler callback; +#endif + uint64_t user_data[6]; +} amd_aql_intercept_marker_t; + +typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count); +typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count, + uint64_t user_pkt_index, void* data, + hsa_amd_queue_intercept_packet_writer writer); +hsa_status_t hsa_amd_queue_intercept_register(hsa_queue_t* queue, + hsa_amd_queue_intercept_handler callback, + void* user_data); +hsa_status_t hsa_amd_queue_intercept_create( + hsa_agent_t agent_handle, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, + uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue); + +typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t* queue, hsa_agent_t agent, + void* data); +hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifier callback, + void* user_data); + +// Structure of Version used to identify an instance of Api table +// Must be the first member (offsetof == 0) of all API tables. +// This is the root of the table passing ABI. 
+struct ApiTableVersion { + uint32_t major_id; + uint32_t minor_id; + uint32_t step_id; + uint32_t reserved; +}; + +struct ToolsApiTable { + ApiTableVersion version; + + hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_alloc_end_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_free_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_free_end_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_start_fn; + hsa_amd_tool_event hsa_amd_tool_scratch_event_async_reclaim_end_fn; +}; + +// Table to export HSA Finalizer Extension Apis +struct FinalizerExtTable { + ApiTableVersion version; + decltype(hsa_ext_program_create)* hsa_ext_program_create_fn; + decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn; + decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn; + decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn; + decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn; + decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn; +}; + +// Table to export HSA Image Extension Apis +struct ImageExtTable { + ApiTableVersion version; + decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn; + decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn; + decltype(hsa_ext_image_create)* hsa_ext_image_create_fn; + decltype(hsa_ext_image_import)* hsa_ext_image_import_fn; + decltype(hsa_ext_image_export)* hsa_ext_image_export_fn; + decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn; + decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn; + decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn; + decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn; + decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn; + decltype(hsa_ext_image_get_capability_with_layout)* hsa_ext_image_get_capability_with_layout_fn; + decltype(hsa_ext_image_data_get_info_with_layout)* 
hsa_ext_image_data_get_info_with_layout_fn; + decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn; + decltype(hsa_ext_sampler_create_v2)* hsa_ext_sampler_create_v2_fn; + +}; + +// Table to export HSA PC Sampling Extension Apis +struct PcSamplingExtTable { + ApiTableVersion version; + decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn; + decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn; + decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn; + decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn; + decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn; + decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn; + decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn; +}; + + +// Table to export AMD Extension Apis +struct AmdExtTable { + ApiTableVersion version; + decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn; + decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn; + decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn; + decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn; + decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn; + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn; + decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; + decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; + decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; + decltype(hsa_amd_memory_pool_allocate)* 
hsa_amd_memory_pool_allocate_fn; + decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; + decltype(hsa_amd_memory_async_copy_on_engine)* hsa_amd_memory_async_copy_on_engine_fn; + decltype(hsa_amd_memory_copy_engine_status)* hsa_amd_memory_copy_engine_status_fn; + decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; + decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn; + decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn; + decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn; + decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn; + decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn; + decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn; + decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn; + decltype(hsa_amd_image_create)* hsa_amd_image_create_fn; + decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn; + decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn; + decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn; + decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn; + decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn; + decltype(hsa_amd_signal_create)* hsa_amd_signal_create_fn; + decltype(hsa_amd_ipc_signal_create)* hsa_amd_ipc_signal_create_fn; + decltype(hsa_amd_ipc_signal_attach)* hsa_amd_ipc_signal_attach_fn; + decltype(hsa_amd_register_system_event_handler)* hsa_amd_register_system_event_handler_fn; + decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; + decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn; + decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; + 
decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn; + decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn; + decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn; + decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn; + decltype(hsa_amd_signal_value_pointer)* hsa_amd_signal_value_pointer_fn; + decltype(hsa_amd_svm_attributes_set)* hsa_amd_svm_attributes_set_fn; + decltype(hsa_amd_svm_attributes_get)* hsa_amd_svm_attributes_get_fn; + decltype(hsa_amd_svm_prefetch_async)* hsa_amd_svm_prefetch_async_fn; + decltype(hsa_amd_spm_acquire)* hsa_amd_spm_acquire_fn; + decltype(hsa_amd_spm_release)* hsa_amd_spm_release_fn; + decltype(hsa_amd_spm_set_dest_buffer)* hsa_amd_spm_set_dest_buffer_fn; + decltype(hsa_amd_queue_cu_get_mask)* hsa_amd_queue_cu_get_mask_fn; + decltype(hsa_amd_portable_export_dmabuf)* hsa_amd_portable_export_dmabuf_fn; + decltype(hsa_amd_portable_close_dmabuf)* hsa_amd_portable_close_dmabuf_fn; + decltype(hsa_amd_vmem_address_reserve)* hsa_amd_vmem_address_reserve_fn; + decltype(hsa_amd_vmem_address_free)* hsa_amd_vmem_address_free_fn; + decltype(hsa_amd_vmem_handle_create)* hsa_amd_vmem_handle_create_fn; + decltype(hsa_amd_vmem_handle_release)* hsa_amd_vmem_handle_release_fn; + decltype(hsa_amd_vmem_map)* hsa_amd_vmem_map_fn; + decltype(hsa_amd_vmem_unmap)* hsa_amd_vmem_unmap_fn; + decltype(hsa_amd_vmem_set_access)* hsa_amd_vmem_set_access_fn; + decltype(hsa_amd_vmem_get_access)* hsa_amd_vmem_get_access_fn; + decltype(hsa_amd_vmem_export_shareable_handle)* hsa_amd_vmem_export_shareable_handle_fn; + decltype(hsa_amd_vmem_import_shareable_handle)* hsa_amd_vmem_import_shareable_handle_fn; + decltype(hsa_amd_vmem_retain_alloc_handle)* hsa_amd_vmem_retain_alloc_handle_fn; + decltype(hsa_amd_vmem_get_alloc_properties_from_handle)* + hsa_amd_vmem_get_alloc_properties_from_handle_fn; + 
decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn; + decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn; + decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn; + decltype(hsa_amd_enable_logging)* hsa_amd_enable_logging_fn; + decltype(hsa_amd_signal_wait_all)* hsa_amd_signal_wait_all_fn; + decltype(hsa_amd_memory_get_preferred_copy_engine)* hsa_amd_memory_get_preferred_copy_engine_fn; + decltype(hsa_amd_portable_export_dmabuf_v2)* hsa_amd_portable_export_dmabuf_v2_fn; + decltype(hsa_amd_ais_file_write)* hsa_amd_ais_file_write_fn; + decltype(hsa_amd_ais_file_read)* hsa_amd_ais_file_read_fn; +}; + +// Table to export HSA Core Runtime Apis +struct CoreApiTable { + ApiTableVersion version; + decltype(hsa_init)* hsa_init_fn; + decltype(hsa_shut_down)* hsa_shut_down_fn; + decltype(hsa_system_get_info)* hsa_system_get_info_fn; + decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn; + decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn; + decltype(hsa_iterate_agents)* hsa_iterate_agents_fn; + decltype(hsa_agent_get_info)* hsa_agent_get_info_fn; + decltype(hsa_queue_create)* hsa_queue_create_fn; + decltype(hsa_soft_queue_create)* hsa_soft_queue_create_fn; + decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn; + decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; + decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; + decltype(hsa_queue_cas_write_index_scacq_screl)* 
hsa_queue_cas_write_index_scacq_screl_fn; + decltype(hsa_queue_cas_write_index_scacquire)* hsa_queue_cas_write_index_scacquire_fn; + decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn; + decltype(hsa_queue_cas_write_index_screlease)* hsa_queue_cas_write_index_screlease_fn; + decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn; + decltype(hsa_queue_add_write_index_scacquire)* hsa_queue_add_write_index_scacquire_fn; + decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn; + decltype(hsa_queue_add_write_index_screlease)* hsa_queue_add_write_index_screlease_fn; + decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn; + decltype(hsa_queue_store_read_index_screlease)* hsa_queue_store_read_index_screlease_fn; + decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn; + decltype(hsa_region_get_info)* hsa_region_get_info_fn; + decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn; + decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn; + decltype(hsa_memory_register)* hsa_memory_register_fn; + decltype(hsa_memory_deregister)* hsa_memory_deregister_fn; + decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; + decltype(hsa_memory_free)* hsa_memory_free_fn; + decltype(hsa_memory_copy)* hsa_memory_copy_fn; + decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; + decltype(hsa_signal_create)* hsa_signal_create_fn; + decltype(hsa_signal_destroy)* hsa_signal_destroy_fn; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn; + decltype(hsa_signal_load_scacquire)* hsa_signal_load_scacquire_fn; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn; + decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire_fn; + 
decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn; + decltype(hsa_signal_and_scacquire)* hsa_signal_and_scacquire_fn; + decltype(hsa_signal_and_screlease)* hsa_signal_and_screlease_fn; + decltype(hsa_signal_and_scacq_screl)* hsa_signal_and_scacq_screl_fn; + decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn; + decltype(hsa_signal_or_scacquire)* hsa_signal_or_scacquire_fn; + decltype(hsa_signal_or_screlease)* hsa_signal_or_screlease_fn; + decltype(hsa_signal_or_scacq_screl)* hsa_signal_or_scacq_screl_fn; + decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn; + decltype(hsa_signal_xor_scacquire)* hsa_signal_xor_scacquire_fn; + decltype(hsa_signal_xor_screlease)* hsa_signal_xor_screlease_fn; + decltype(hsa_signal_xor_scacq_screl)* hsa_signal_xor_scacq_screl_fn; + decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn; + decltype(hsa_signal_exchange_scacquire)* hsa_signal_exchange_scacquire_fn; + decltype(hsa_signal_exchange_screlease)* hsa_signal_exchange_screlease_fn; + decltype(hsa_signal_exchange_scacq_screl)* hsa_signal_exchange_scacq_screl_fn; + decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn; + decltype(hsa_signal_add_scacquire)* hsa_signal_add_scacquire_fn; + decltype(hsa_signal_add_screlease)* hsa_signal_add_screlease_fn; + decltype(hsa_signal_add_scacq_screl)* hsa_signal_add_scacq_screl_fn; + decltype(hsa_signal_subtract_relaxed)* hsa_signal_subtract_relaxed_fn; + decltype(hsa_signal_subtract_scacquire)* hsa_signal_subtract_scacquire_fn; + decltype(hsa_signal_subtract_screlease)* hsa_signal_subtract_screlease_fn; + decltype(hsa_signal_subtract_scacq_screl)* hsa_signal_subtract_scacq_screl_fn; + decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn; + decltype(hsa_signal_cas_scacquire)* hsa_signal_cas_scacquire_fn; + decltype(hsa_signal_cas_screlease)* hsa_signal_cas_screlease_fn; + decltype(hsa_signal_cas_scacq_screl)* hsa_signal_cas_scacq_screl_fn; + + //===--- Instruction Set Architecture 
-----------------------------------===// + + decltype(hsa_isa_from_name)* hsa_isa_from_name_fn; + // Deprecated since v1.1. + decltype(hsa_isa_get_info)* hsa_isa_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_isa_compatible)* hsa_isa_compatible_fn; + + //===--- Code Objects (deprecated) --------------------------------------===// + + // Deprecated since v1.1. + decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn; + // Deprecated since v1.1. + decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn; + + //===--- Executable -----------------------------------------------------===// + + // Deprecated since v1.1. + decltype(hsa_executable_create)* hsa_executable_create_fn; + decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; + // Deprecated since v1.1. + decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn; + decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; + decltype(hsa_executable_get_info)* hsa_executable_get_info_fn; + decltype(hsa_executable_global_variable_define)* + hsa_executable_global_variable_define_fn; + decltype(hsa_executable_agent_global_variable_define)* + hsa_executable_agent_global_variable_define_fn; + decltype(hsa_executable_readonly_variable_define)* + hsa_executable_readonly_variable_define_fn; + decltype(hsa_executable_validate)* hsa_executable_validate_fn; + // Deprecated since v1.1. 
+ decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn; + // Deprecated since v1.1. + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn; + + //===--- Runtime Notifications ------------------------------------------===// + + decltype(hsa_status_string)* hsa_status_string_fn; + + // Start HSA v1.1 additions + decltype(hsa_extension_get_name)* hsa_extension_get_name_fn; + decltype(hsa_system_major_extension_supported)* hsa_system_major_extension_supported_fn; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table_fn; + decltype(hsa_agent_major_extension_supported)* hsa_agent_major_extension_supported_fn; + decltype(hsa_cache_get_info)* hsa_cache_get_info_fn; + decltype(hsa_agent_iterate_caches)* hsa_agent_iterate_caches_fn; + decltype(hsa_signal_silent_store_relaxed)* hsa_signal_silent_store_relaxed_fn; + decltype(hsa_signal_silent_store_screlease)* hsa_signal_silent_store_screlease_fn; + decltype(hsa_signal_group_create)* hsa_signal_group_create_fn; + decltype(hsa_signal_group_destroy)* hsa_signal_group_destroy_fn; + decltype(hsa_signal_group_wait_any_scacquire)* hsa_signal_group_wait_any_scacquire_fn; + decltype(hsa_signal_group_wait_any_relaxed)* hsa_signal_group_wait_any_relaxed_fn; + + //===--- Instruction Set Architecture - HSA v1.1 additions --------------===// + + decltype(hsa_agent_iterate_isas)* hsa_agent_iterate_isas_fn; + decltype(hsa_isa_get_info_alt)* hsa_isa_get_info_alt_fn; + decltype(hsa_isa_get_exception_policies)* hsa_isa_get_exception_policies_fn; + decltype(hsa_isa_get_round_method)* hsa_isa_get_round_method_fn; + decltype(hsa_wavefront_get_info)* hsa_wavefront_get_info_fn; + decltype(hsa_isa_iterate_wavefronts)* hsa_isa_iterate_wavefronts_fn; + + //===--- Code Objects (deprecated) - HSA v1.1 additions -----------------===// + + // Deprecated since v1.1. 
+ decltype(hsa_code_object_get_symbol_from_name)* + hsa_code_object_get_symbol_from_name_fn; + + //===--- Executable - HSA v1.1 additions --------------------------------===// + + decltype(hsa_code_object_reader_create_from_file)* + hsa_code_object_reader_create_from_file_fn; + decltype(hsa_code_object_reader_create_from_memory)* + hsa_code_object_reader_create_from_memory_fn; + decltype(hsa_code_object_reader_destroy)* hsa_code_object_reader_destroy_fn; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt_fn; + decltype(hsa_executable_load_program_code_object)* + hsa_executable_load_program_code_object_fn; + decltype(hsa_executable_load_agent_code_object)* + hsa_executable_load_agent_code_object_fn; + decltype(hsa_executable_validate_alt)* hsa_executable_validate_alt_fn; + decltype(hsa_executable_get_symbol_by_name)* + hsa_executable_get_symbol_by_name_fn; + decltype(hsa_executable_iterate_agent_symbols)* + hsa_executable_iterate_agent_symbols_fn; + decltype(hsa_executable_iterate_program_symbols)* + hsa_executable_iterate_program_symbols_fn; +}; + +// Table to export HSA Apis from Core Runtime, Amd Extensions +// Finalizer and Images +struct HsaApiTable { + + // Version of Hsa Api Table + ApiTableVersion version; + + // Table of function pointers to HSA Core Runtime + CoreApiTable* core_; + + // Table of function pointers to AMD extensions + AmdExtTable* amd_ext_; + + // Table of function pointers to HSA Finalizer Extension + FinalizerExtTable* finalizer_ext_; + + // Table of function pointers to HSA Image Extension + ImageExtTable* image_ext_; + + // Table of function pointers for tools to use + ToolsApiTable* tools_; + + // Table of function pointers to AMD PC Sampling Extension + PcSamplingExtTable* pc_sampling_ext_; +}; + +// Structure containing instances of different api tables +struct HsaApiTableContainer { + HsaApiTable root; + CoreApiTable core; + AmdExtTable amd_ext; + FinalizerExtTable finalizer_ext; + ImageExtTable image_ext; + 
ToolsApiTable tools; + PcSamplingExtTable pc_sampling_ext; + + // Default initialization of a container instance + HsaApiTableContainer() { + root.version.major_id = HSA_API_TABLE_MAJOR_VERSION; + root.version.minor_id = sizeof(HsaApiTable); + root.version.step_id = HSA_API_TABLE_STEP_VERSION; + + core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION; + core.version.minor_id = sizeof(CoreApiTable); + core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION; + root.core_ = &core; + + amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION; + amd_ext.version.minor_id = sizeof(AmdExtTable); + amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION; + root.amd_ext_ = &amd_ext; + + finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; + finalizer_ext.version.minor_id = sizeof(FinalizerExtTable); + finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; + root.finalizer_ext_ = &finalizer_ext; + + image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; + image_ext.version.minor_id = sizeof(ImageExtTable); + image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION; + root.image_ext_ = &image_ext; + + tools.version.major_id = HSA_TOOLS_API_TABLE_MAJOR_VERSION; + tools.version.minor_id = sizeof(ToolsApiTable); + tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION; + root.tools_ = &tools; + + pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable); + pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + root.pc_sampling_ext_ = &pc_sampling_ext; + } +}; + +// Api to copy function pointers of a table +static +void inline copyApi(void* src, void* dest, size_t size) { + assert(size >= sizeof(ApiTableVersion)); + memcpy((char*)src + sizeof(ApiTableVersion), + (char*)dest + sizeof(ApiTableVersion), + (size - sizeof(ApiTableVersion))); +} + +// Copy Api child tables if valid. 
+static void inline copyElement(ApiTableVersion* dest, ApiTableVersion* src) { + if (src->major_id && (dest->major_id == src->major_id)) { + dest->step_id = src->step_id; + dest->minor_id = Min(dest->minor_id, src->minor_id); // minor id holds the table size in bytes; keep the smaller of the two + copyApi(dest, src, dest->minor_id); // copyApi() writes into its first argument (despite its parameter names), so this fills dest from src. NOTE(review): copyApi() only assert()s that the size covers ApiTableVersion - confirm minor_id can never be smaller + } else { // src major id is 0 or differs from dest's: zero all of dest's version fields + dest->major_id = 0; + dest->minor_id = 0; + dest->step_id = 0; + } +} + +// Copy constructor for all Api tables. The function assumes the +// user has initialized an instance of tables container correctly +// for the Major, Minor and Stepping Ids of Root and Child Api tables. +// The function will overwrite the value of Minor Id by taking the +// minimum of source and destination parameters. It will also overwrite +// the stepping Id with value from source parameter. +static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) { + // Verify Major Id of source and destination tables match + if (dest->version.major_id != src->version.major_id) { + dest->version.major_id = 0; + dest->version.minor_id = 0; + dest->version.step_id = 0; + return; + } + + // Initialize the stepping id and minor id of root table.
For the + // minor id which encodes struct size, take the minimum of source + // and destination parameters + dest->version.step_id = src->version.step_id; + dest->version.minor_id = Min(dest->version.minor_id, src->version.minor_id); + + // Copy child tables if present + if ((offsetof(HsaApiTable, core_) < dest->version.minor_id)) + copyElement(&dest->core_->version, &src->core_->version); + if ((offsetof(HsaApiTable, amd_ext_) < dest->version.minor_id)) + copyElement(&dest->amd_ext_->version, &src->amd_ext_->version); + if ((offsetof(HsaApiTable, finalizer_ext_) < dest->version.minor_id)) + copyElement(&dest->finalizer_ext_->version, &src->finalizer_ext_->version); + if ((offsetof(HsaApiTable, image_ext_) < dest->version.minor_id)) + copyElement(&dest->image_ext_->version, &src->image_ext_->version); + if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id)) + copyElement(&dest->tools_->version, &src->tools_->version); + if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id)) + copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version); +} +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h new file mode 100644 index 0000000000..6cf1054823 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_api_trace_version.h @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H + +// CODE IN THIS FILE **MUST** BE C-COMPATIBLE + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x03 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x01 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x08 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x01 +// Rocprofiler just checks HSA_IMAGE_EXT_API_TABLE_STEP_VERSION +#define HSA_IMAGE_EXT_API_TABLE_STEP_VERSION HSA_IMAGE_API_TABLE_STEP_VERSION +#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 +#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00 + +#endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h new file mode 100644 index 0000000000..3fd1f9348e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_amd.h @@ -0,0 +1,3782 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension. 
+ +#ifndef HSA_RUNTIME_EXT_AMD_H_ +#define HSA_RUNTIME_EXT_AMD_H_ + +#include "hsa.h" +#include "hsa_ext_image.h" +#include "hsa_ven_amd_pc_sampling.h" + +/** + * - 1.0 - initial version + * - 1.1 - dmabuf export + * - 1.2 - hsa_amd_memory_async_copy_on_engine + * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool + * - 1.4 - Virtual Memory API + * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES + * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align + * - 1.7 - hsa_amd_signal_wait_all + * - 1.8 - hsa_amd_memory_get_preferred_copy_engine + * - 1.9 - hsa_amd_portable_export_dmabuf_v2 + * - 1.10 - hsa_amd_vmem_address_reserve: HSA_AMD_VMEM_ADDRESS_NO_REGISTER + * - 1.11 - hsa_amd_agent_info_t: HSA_AMD_AGENT_INFO_CLOCK_COUNTERS + * - 1.12 - hsa_amd_pointer_info: HSA_EXT_POINTER_TYPE_HSA_VMEM and HSA_EXT_POINTER_TYPE_RESERVED_ADDR + * - 1.13 - hsa_amd_pointer_info: Added new registered field to hsa_amd_pointer_info_t + * - 1.14 - hsa_amd_ais_file_write, hsa_amd_ais_file_read + */ +#define HSA_AMD_INTERFACE_VERSION_MAJOR 1 +#define HSA_AMD_INTERFACE_VERSION_MINOR 14 + +#ifdef __cplusplus +extern "C" { +#endif + +/** \addtogroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Set the flag at bit position @p bit within uint8_t[8] types. + */ +static inline void hsa_flag_set64(uint8_t* value, uint32_t bit) { + unsigned int index = bit / 8; + unsigned int subBit = bit % 8; + (((uint8_t*)value)[index]) |= (1 << subBit); +} + +/** + * @brief Determine whether the flag at bit position @p bit is set within uint8_t[8] types. + */ +static inline bool hsa_flag_isset64(uint8_t* value, uint32_t bit) { + unsigned int index = bit / 8; + unsigned int subBit = bit % 8; + return ((uint8_t*)value)[index] & (1 << subBit); +} + +/** + * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants. + */ +typedef uint32_t hsa_signal_condition32_t; + +/** + * @brief AMD vendor specific packet type.
+ */ +typedef enum { + /** + * Packet used by agents to delay processing of subsequent packets until a + * configurable condition is satisfied by an HSA signal. Only kernel dispatch + * queues created from AMD GPU Agents support this packet. + */ + HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, + /** + * Packet used to send commands to an AIE agent's embedded runtime (ERT). The + * ERT is responsible for, among other things, handling dispatches. Only + * queues created on AIE agents support this packet. + */ + HSA_AMD_PACKET_TYPE_AIE_ERT = 3 +} hsa_amd_packet_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants. + */ +typedef uint8_t hsa_amd_packet_type8_t; + +/** + * @brief AMD vendor specific AQL packet header + */ +typedef struct hsa_amd_packet_header_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Format of the vendor specific packet. + */ + hsa_amd_packet_type8_t AmdFormat; + + /** + * Reserved. Must be 0. + */ + uint8_t reserved; +} hsa_amd_vendor_packet_header_t; + +/** + * @brief AMD barrier value packet. Halts packet processing and waits for + * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value + * is the value of the signal ::signal. + */ +typedef struct hsa_amd_barrier_value_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + + /** + * Dependent signal object. A signal with a handle value of 0 is + * allowed and is interpreted by the packet processor as a satisfied + * dependency. + */ + hsa_signal_t signal; + + /** + * Value to compare against. + */ + hsa_signal_value_t value; + + /** + * Bit mask to be combined by bitwise AND with ::signal's value. + */ + hsa_signal_value_t mask; + + /** + * Comparison operation. See ::hsa_signal_condition_t.
+ */ + hsa_signal_condition32_t cond; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; +} hsa_amd_barrier_value_packet_t; + +/** + * State of an AIE ERT command. + */ +typedef enum { + /** + * Set by the host before submitting a command to the scheduler. + */ + HSA_AMD_AIE_ERT_STATE_NEW = 1, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_QUEUED = 2, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_RUNNING = 3, + /** + * Set by the scheduler when a command completes. + */ + HSA_AMD_AIE_ERT_STATE_COMPLETED = 4, + /** + * Set by the scheduler if a command failed. + */ + HSA_AMD_AIE_ERT_STATE_ERROR = 5, + /** + * Set by the scheduler if a command aborted. + */ + HSA_AMD_AIE_ERT_STATE_ABORT = 6, + /** + * Internal scheduler state. + */ + HSA_AMD_AIE_ERT_STATE_SUBMITTED = 7, + /** + * Set by the scheduler on a timeout and reset. + */ + HSA_AMD_AIE_ERT_STATE_TIMEOUT = 8, + /** + * Set by the scheduler on a timeout and fail to reset. + */ + HSA_AMD_AIE_ERT_STATE_NORESPONSE = 9, + HSA_AMD_AIE_ERT_STATE_SKERROR = 10, + HSA_AMD_AIE_ERT_STATE_SKCRASHED = 11, + HSA_AMD_AIE_ERT_STATE_MAX +} hsa_amd_aie_ert_state; + +/** + * Opcode types for HSA AIE ERT commands. + */ +typedef enum { + /** + * Start a workgroup on a compute unit (CU). + */ + HSA_AMD_AIE_ERT_START_CU = 0, + /** + * Currently aliased to HSA_AMD_AIE_ERT_START_CU. + */ + HSA_AMD_AIE_ERT_START_KERNEL = 0, + /** + * Configure command scheduler. + */ + HSA_AMD_AIE_ERT_CONFIGURE = 2, + HSA_AMD_AIE_ERT_EXIT = 3, + HSA_AMD_AIE_ERT_ABORT = 4, + /** + * Execute a specified CU after writing. + */ + HSA_AMD_AIE_ERT_EXEC_WRITE = 5, + /** + * Get stats about a CU's execution. 
+ */ + HSA_AMD_AIE_ERT_CU_STAT = 6, + /** + * Start KDMA CU or P2P. + */ + HSA_AMD_AIE_ERT_START_COPYBO = 7, + /** + * Configure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_CONFIG = 8, + /** + * Start a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_START = 9, + /** + * Unconfigure a soft kernel. + */ + HSA_AMD_AIE_ERT_SK_UNCONFIG = 10, + /** + * Initialize a CU. + */ + HSA_AMD_AIE_ERT_INIT_CU = 11, + HSA_AMD_AIE_ERT_START_FA = 12, + HSA_AMD_AIE_ERT_CLK_CALIB = 13, + HSA_AMD_AIE_ERT_MB_VALIDATE = 14, + /** + * Same as HSA_AMD_AIE_ERT_START_CU but with a key-value pair. + */ + HSA_AMD_AIE_ERT_START_KEY_VAL = 15, + HSA_AMD_AIE_ERT_ACCESS_TEST_C = 16, + HSA_AMD_AIE_ERT_ACCESS_TEST = 17, + /** + * Instruction buffer command format. + */ + HSA_AMD_AIE_ERT_START_DPU = 18, + /** + * Command chain. + */ + HSA_AMD_AIE_ERT_CMD_CHAIN = 19, + /** + * Instruction buffer command format on NPU. + */ + HSA_AMD_AIE_ERT_START_NPU = 20, + /** + * Instruction buffer command with pre-emption format on the NPU. + */ + HSA_AMD_AIE_ERT_START_NPU_PREEMPT = 21 +} hsa_amd_aie_ert_cmd_opcode_t; + +/** + * Payload data for AIE ERT start kernel packets (i.e., when the opcode is + * HSA_AMD_AIE_ERT_START_KERNEL). + */ +typedef struct hsa_amd_aie_ert_start_kernel_data_s { + /** + * Address to the PDI. + */ + void* pdi_addr; + /** + * Opcode, instructions and kernel arguments. + */ + uint32_t data[]; +} hsa_amd_aie_ert_start_kernel_data_t; + +/** + * AMD AIE ERT packet. Used for sending a command to an AIE agent. + */ +typedef struct hsa_amd_aie_ert_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + /** + * Format for packets interpreted by the ERT to understand the command and + * payload data. + */ + struct { + /** + * Current state of a command. + */ + uint32_t state : 4; + /** + * Flexible field that can be interpreted on a per-command basis. + */ + uint32_t custom : 8; + /** + * Number of DWORDs in the payload data. 
+ */ + uint32_t count : 11; + /** + * Opcode identifying the command. + */ + uint32_t opcode : 5; + /** + * Type of a command (currently 0). + */ + uint32_t type : 4; + }; + /** + * Reserved. Must be 0. + */ + uint64_t reserved0; + /** + * Reserved. Must be 0. + */ + uint64_t reserved1; + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + /** + * Reserved. Must be 0. + */ + uint64_t reserved4; + /** + * Reserved. Must be 0. + */ + uint64_t reserved5; + /** + * Address of packet data payload. ERT commands contain arbitrarily sized + * data payloads. + */ + uint64_t payload_data; +} hsa_amd_aie_ert_packet_t; + +/** @} */ + +/** \defgroup error-codes Error codes + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * The memory pool is invalid. + */ + HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40, + + /** + * Agent accessed memory beyond the maximum legal address. + */ + HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41, + + /** + * Agent executed an invalid shader instruction. + */ + HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42, + + /** + * Agent attempted to access an inaccessible address. + * See hsa_amd_register_system_event_handler and + * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses. + */ + HSA_STATUS_ERROR_MEMORY_FAULT = 43, + + /** + * The CU mask was successfully set but the mask attempted to enable a CU + * which was disabled for the process. CUs disabled for the process remain + * disabled. 
+ */ + HSA_STATUS_CU_MASK_REDUCED = 44, + + /** + * Exceeded number of VGPRs available on this agent + */ + HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45, + + /** + * Resource is busy or temporarily unavailable + */ + HSA_STATUS_ERROR_RESOURCE_BUSY = 46, + + /** + * Request is not supported by this system + */ + HSA_STATUS_ERROR_NOT_SUPPORTED = 47, +}; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief IOMMU version supported + */ +typedef enum { + /** + * IOMMU not supported + */ + HSA_IOMMU_SUPPORT_NONE = 0, + /* IOMMU V1 support is not relevant to user applications, so not reporting it */ + /** + * IOMMU V2 supported + */ + HSA_IOMMU_SUPPORT_V2 = 1, +} hsa_amd_iommu_version_t; + +/** + * @brief Structure containing information on the agent's clock counters. + */ +typedef struct hsa_amd_clock_counters_s { + uint64_t gpu_clock_counter; + uint64_t cpu_clock_counter; + uint64_t system_clock_counter; + uint64_t system_clock_frequency; +} hsa_amd_clock_counters_t; + +/** + * @brief Agent attributes. + */ +typedef enum hsa_amd_agent_info_s { + /** + * Chip identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000, + /** + * Size of a cacheline in bytes. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001, + /** + * The number of compute unit available in the agent. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002, + /** + * The maximum clock frequency of the agent in MHz. The type of this + * attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003, + /** + * Internal driver node identifier. The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004, + /** + * Max number of watch points on memory address ranges to generate exception + * events when the watched addresses are accessed. The type of this + * attribute is uint32_t. 
+ */ + HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005, + /** + * Agent BDF_ID, named LocationID in thunk. The type of this attribute is + * uint32_t. + */ + HSA_AMD_AGENT_INFO_BDFID = 0xA006, + /** + * Memory Interface width, the return value type is uint32_t. + * This attribute is deprecated. + */ + HSA_AMD_AGENT_INFO_MEMORY_WIDTH = 0xA007, + /** + * Max Memory Clock, the return value type is uint32_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008, + /** + * Board name of Agent - populated from MarketingName of Kfd Node + * The value is an Ascii string of 64 chars. + */ + HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009, + /** + * Maximum number of waves possible in a Compute Unit. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, + /** + * Number of SIMD's per compute unit CU + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, + /** + * Number of Shader Engines (SE) in Gpu + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES = 0xA00C, + /** + * Number of Shader Arrays Per Shader Engines in Gpu + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE = 0xA00D, + /** + * Address of the HDP flush registers. Use of these registers does not conform to the HSA memory + * model and should be treated with caution. + * The type of this attribute is hsa_amd_hdp_flush_t. + */ + HSA_AMD_AGENT_INFO_HDP_FLUSH = 0xA00E, + /** + * PCIe domain for the agent. Pairs with HSA_AMD_AGENT_INFO_BDFID + * to give the full physical location of the Agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F, + /** + * Queries for support of cooperative queues. See ::HSA_QUEUE_TYPE_COOPERATIVE. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + /** + * Queries UUID of an agent. 
The value is an Ascii string with a maximum + * of 21 chars including NUL. The string value consists of two parts: header + * and body. The header identifies device type (GPU, CPU, DSP) while body + * encodes UUID as a 16 digit hex string + * + * Agents that do not support UUID will return the string "GPU-XX" or + * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t + */ + HSA_AMD_AGENT_INFO_UUID = 0xA011, + /** + * Queries for the ASIC revision of an agent. The value is an integer that + * increments for each revision. This can be used by user-level software to + * change how it operates, depending on the hardware version. This allows + * selective workarounds for hardware errata. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012, + /** + * Queries whether or not the host can directly access SVM memory that is + * physically resident in the agent's local memory. + * The type of this attribute is bool. + */ + HSA_AMD_AGENT_INFO_SVM_DIRECT_HOST_ACCESS = 0xA013, + /** + * Some processors support more CUs than can reliably be used in a cooperative + * dispatch. This queries the count of CUs which are fully enabled for + * cooperative dispatch. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT = 0xA014, + /** + * Queries the amount of memory available in bytes across all global pools + * owned by the agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_MEMORY_AVAIL = 0xA015, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, + /** + * Queries for the ASIC family ID of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_ASIC_FAMILY_ID = 0xA107, + /** + * Queries for the Packet Processor(CP Firmware) ucode version of an agent.
+ * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_UCODE_VERSION = 0xA108, + /** + * Queries for the SDMA engine ucode of an agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_SDMA_UCODE_VERSION = 0xA109, + /** + * Queries the number of SDMA engines. + * If HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG query returns non-zero, + * this query returns the number of SDMA engines optimized for + * host to device bidirectional traffic. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SDMA_ENG = 0xA10A, + /** + * Queries the number of additional SDMA engines optimized for D2D xGMI copies. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG = 0xA10B, + /** + * Queries for version of IOMMU supported by agent. + * The type of this attribute is hsa_amd_iommu_version_t. + */ + HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110, + /** + * Queries for number of XCCs within the agent. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111, + /** + * Queries for driver unique identifier. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112, + /** + * Returns the hsa_agent_t of the nearest CPU agent + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_AGENT_INFO_NEAREST_CPU = 0xA113, + /** + * Bit-mask indicating memory properties of this agent. A memory property is set if the flag bit + * is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag + * is set. The type of this attribute is uint8_t[8]. + */ + HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES = 0xA114, + /** + * Bit-mask indicating AQL Extensions supported by this agent. An AQL extension is set if the flag + * bit is set at that position. User may use the hsa_flag_isset64 macro to verify whether a flag + * is set. The type of this attribute is uint8_t[8].
+ */ + HSA_AMD_AGENT_INFO_AQL_EXTENSIONS = 0xA115, /* Not implemented yet */ + /** + * Maximum allowed value in bytes for scratch limit for this agent. This amount + * is shared across all queues created on this agent. + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_MAX = 0xA116, + /** + * Current scratch limit threshold in bytes for this agent. This limit can be + * modified using the hsa_amd_agent_set_async_scratch_limit call. + * - AQL dispatches that require scratch-memory above this threshold will trigger a + * scratch use-once. + * - For AQL dispatches using less scratch-memory than this threshold, ROCr will + * permanently assign the allocated scratch memory to the queue handling the dispatch. + * This memory can be reclaimed by calling hsa_amd_agent_set_async_scratch_limit + * with a threshold lower than the current value. + * + * The type of this attribute is uint64_t. + */ + HSA_AMD_AGENT_INFO_SCRATCH_LIMIT_CURRENT = 0xA117, + /** + * Queries the driver for clock counters of the agent. + * The type of this attribute is hsa_amd_clock_counters_t. + */ + HSA_AMD_AGENT_INFO_CLOCK_COUNTERS = 0xA118 +} hsa_amd_agent_info_t; + +/** + * @brief Agent memory properties attributes + */ +typedef enum hsa_amd_agent_memory_properties_s { + HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU = (1 << 0), +} hsa_amd_agent_memory_properties_t; + +/** + * @brief SDMA engine IDs unique by single set bit position.
+ */ +typedef enum hsa_amd_sdma_engine_id { + HSA_AMD_SDMA_ENGINE_0 = 0x1, + HSA_AMD_SDMA_ENGINE_1 = 0x2, + HSA_AMD_SDMA_ENGINE_2 = 0x4, + HSA_AMD_SDMA_ENGINE_3 = 0x8, + HSA_AMD_SDMA_ENGINE_4 = 0x10, + HSA_AMD_SDMA_ENGINE_5 = 0x20, + HSA_AMD_SDMA_ENGINE_6 = 0x40, + HSA_AMD_SDMA_ENGINE_7 = 0x80, + HSA_AMD_SDMA_ENGINE_8 = 0x100, + HSA_AMD_SDMA_ENGINE_9 = 0x200, + HSA_AMD_SDMA_ENGINE_10 = 0x400, + HSA_AMD_SDMA_ENGINE_11 = 0x800, + HSA_AMD_SDMA_ENGINE_12 = 0x1000, + HSA_AMD_SDMA_ENGINE_13 = 0x2000, + HSA_AMD_SDMA_ENGINE_14 = 0x4000, + HSA_AMD_SDMA_ENGINE_15 = 0x8000 +} hsa_amd_sdma_engine_id_t; + +typedef struct hsa_amd_hdp_flush_s { + uint32_t* HDP_MEM_FLUSH_CNTL; + uint32_t* HDP_REG_FLUSH_CNTL; +} hsa_amd_hdp_flush_t; + +/** + * @brief Region attributes. + */ +#ifdef __cplusplus +typedef enum hsa_amd_region_info_s : int { +#else +typedef enum hsa_amd_region_info_s { +#endif + /** + * Determine if host can access the region. The type of this attribute + * is bool. + */ + HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000, + /** + * Base address of the region in flat address space. + */ + HSA_AMD_REGION_INFO_BASE = 0xA001, + /** + * Memory Interface width, the return value type is uint32_t. + * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH. + */ + HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002, + /** + * Max Memory Clock, the return value type is uint32_t. + * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY. + */ + HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003, +} hsa_amd_region_info_t; + +/** + * @brief Coherency attributes of fine grain region. + */ +typedef enum hsa_amd_coherency_type_s { + /** + * Coherent region. + */ + HSA_AMD_COHERENCY_TYPE_COHERENT = 0, + /** + * Non coherent region. 
+ */ + HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1 +} hsa_amd_coherency_type_t; + + +/** + * @brief dmabuf attributes + */ +#ifdef __cplusplus +typedef enum hsa_amd_dma_buf_mapping_type_s : int { +#else +typedef enum hsa_amd_dma_buf_mapping_type_s { +#endif + HSA_AMD_DMABUF_MAPPING_TYPE_NONE = 0, + HSA_AMD_DMABUF_MAPPING_TYPE_PCIE = 1 +} hsa_amd_dma_buf_mapping_type_t; +/** + * @brief Get the coherency type of the fine grain region of an agent. + * + * @param[in] agent A valid agent. + * + * @param[out] type Pointer to a memory location where the HSA runtime will + * store the coherency type of the fine grain region. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL. + */ +hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent, + hsa_amd_coherency_type_t* type); + +/** + * @brief Set the coherency type of the fine grain region of an agent. + * Deprecated. This is supported on KV platforms. For backward compatibility + * other platforms will spuriously succeed. + * + * @param[in] agent A valid agent. + * + * @param[in] type The coherency type to be set. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid. + */ +hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent, + hsa_amd_coherency_type_t type); + +/** @} */ + +/** \defgroup profile Profiling + * @{ + */ + +/** + * @brief Structure containing profiling dispatch time information. + * + * Times are reported as ticks in the domain of the HSA system clock. 
+ * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_dispatch_time_s { + /** + * Dispatch packet processing start time. + */ + uint64_t start; + /** + * Dispatch packet completion time. + */ + uint64_t end; +} hsa_amd_profiling_dispatch_time_t; + +/** + * @brief Structure containing profiling async copy time information. + * + * Times are reported as ticks in the domain of the HSA system clock. + * The HSA system clock tick and frequency is obtained via hsa_system_get_info. + */ +typedef struct hsa_amd_profiling_async_copy_time_s { + /** + * Async copy processing start time. + */ + uint64_t start; + /** + * Async copy completion time. + */ + uint64_t end; +} hsa_amd_profiling_async_copy_time_t; + +/** + * @brief Enable or disable profiling capability of a queue. + * + * @param[in] queue A valid queue. + * + * @param[in] enable 1 to enable profiling. 0 to disable profiling. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API + hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable); + +/** + * @brief Enable or disable asynchronous memory copy profiling. + * + * @details The runtime will provide the copy processing start timestamp and + * completion timestamp of each call to hsa_amd_memory_async_copy if the + * async copy profiling is enabled prior to the call to + * hsa_amd_memory_async_copy. The completion signal object is used to + * hold the last async copy start and end timestamp. The client can retrieve + * these timestamps via call to hsa_amd_profiling_get_async_copy_time. + * + * @param[in] enable True to enable profiling. False to disable profiling. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources + * needed to profile the asynchronous copy. + */ +hsa_status_t HSA_API + hsa_amd_profiling_async_copy_enable(bool enable); + +/** + * @brief Retrieve packet processing time stamps. + * + * @param[in] agent The agent with which the signal was last used. For + * instance, if the profiled dispatch packet is dispatched onto queue Q, + * which was created on agent A, then this parameter must be A. + * + * @param[in] signal A signal used as the completion signal of the dispatch + * packet to retrieve time stamps from. This dispatch packet must have been + * issued to a queue with profiling enabled and have already completed. Also + * the signal must not have yet been used in any other packet following the + * completion of the profiled dispatch packet. + * + * @param[out] time Packet processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time( + hsa_agent_t agent, hsa_signal_t signal, + hsa_amd_profiling_dispatch_time_t* time); + +/** + * @brief Retrieve asynchronous copy timestamps. + * + * @details Async copy profiling is enabled via call to + * hsa_amd_profiling_async_copy_enable. + * + * @param[in] signal A signal used as the completion signal of the call to + * hsa_amd_memory_async_copy. 
+ * + * @param[out] time Async copy processing timestamps in the HSA system clock + * domain. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL. + */ +hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time( + hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time); + +/** + * @brief Computes the frequency ratio and offset between the agent clock and + * HSA system clock and converts the agent's tick to HSA system domain tick. + * + * @param[in] agent The agent used to retrieve the agent_tick. It is user's + * responsibility to make sure the tick number is from this agent, otherwise, + * the behavior is undefined. + * + * @param[in] agent_tick The tick count retrieved from the specified @p agent. + * + * @param[out] system_tick The translated HSA system domain clock counter tick. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL; + */ +hsa_status_t HSA_API + hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent, + uint64_t agent_tick, + uint64_t* system_tick); + +/** @} */ + +/** \defgroup status Runtime notifications + * @{ + */ + +/** + * @brief Signal attribute flags. + */ +typedef enum { + /** + * Signal will only be consumed by AMD GPUs. Limits signal consumption to + * AMD GPU agents only. Ignored if @p num_consumers is not zero (all agents). + */ + HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1, + /** + * Signal may be used for interprocess communication. 
+ * IPC signals can be read, written, and waited on from any process. + * Profiling using an IPC enabled signal is only supported in a single process + * at a time. Producing profiling data in one process and consuming it in + * another process is undefined. + */ + HSA_AMD_SIGNAL_IPC = 2, +} hsa_amd_signal_attribute_t; + +/** + * @brief Create a signal with specific attributes. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[in] attributes Requested signal attributes. Multiple signal attributes + * may be requested by combining them with bitwise OR. Requesting no attributes + * (@p attributes == 0) results in the same signal as would have been obtained + * via hsa_signal_create. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. 
+ */ +hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, + const hsa_agent_t* consumers, uint64_t attributes, + hsa_signal_t* signal); + +/** + * @brief Returns a pointer to the value of a signal. + * + * Use of this API does not modify the lifetime of ::signal and any + * hsa_signal_value_t retrieved by this API has lifetime equal to that of + * ::signal. + * + * This API is intended for partial interoperability with non-HSA compatible + * devices and should not be used where HSA interfaces are available. + * + * Use of the signal value must comply with use restrictions of ::signal. + * Use may result in data races if the operations performed are not platform + * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC + * attributed signals is required. + * + * @param[in] signal Signal handle to extract the signal value pointer from. + * + * @param[out] value_ptr Location where the extracted signal value pointer will be placed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL. + */ +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + +/** + * @brief Asynchronous signal handler function type. + * + * @details Type definition of callback function to be used with + * hsa_amd_signal_async_handler. This callback is invoked if the associated + * signal and condition are met. The callback receives the value of the signal + * which satisfied the associated wait condition and a user provided value. If + * the callback returns true then the callback will be called again if the + * associated signal and condition are satisfied again.
If the callback returns + * false then it will not be called again. + * + * @param[in] value Contains the value of the signal observed by + * hsa_amd_signal_async_handler which caused the signal handler to be invoked. + * + * @param[in] arg Contains the user provided value given when the signal handler + * was registered with hsa_amd_signal_async_handler + * + * @retval true resumes monitoring the signal with this handler (as if calling + * hsa_amd_signal_async_handler again with identical parameters) + * + * @retval false stops monitoring the signal with this handler (handler will + * not be called again for this signal) + * + */ +typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg); + +/** + * @brief Register asynchronous signal handler function. + * + * @details Allows registering a callback function and user provided value with + * a signal and wait condition. The callback will be invoked if the associated + * signal and wait condition are satisfied. Callbacks will be invoked serially + * but in an arbitrary order so callbacks should be independent of each other. + * After being invoked a callback may continue to wait for its associated signal + * and condition and, possibly, be invoked again. Or the callback may stop + * waiting. If the callback returns true then it will continue waiting and may + * be called again. If false then the callback will not wait again and will not + * be called again for the associated signal and condition. It is possible to + * register the same callback multiple times with the same or different signals + * and/or conditions. Each registration of the callback will be treated entirely + * independently. 
+ * + * @param[in] signal hsa signal to be asynchronously monitored + * + * @param[in] cond condition value to monitor for + * + * @param[in] value signal value used in condition expression + * + * @param[in] handler asynchronous signal handler invoked when signal's + * condition is met + * + * @param[in] arg user provided value which is provided to handler when handler + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_signal_async_handler(hsa_signal_t signal, + hsa_signal_condition_t cond, + hsa_signal_value_t value, + hsa_amd_signal_handler handler, void* arg); + +/** + * @brief Wait for all signal-condition pairs to be satisfied. + * + * @details Allows waiting for all of several signal and condition pairs to be + * satisfied. The function returns 0 if all signals met their conditions and -1 + * on a timeout. The value of each signal's satisfying value is returned in + * satisfying_values unless satisfying_values is nullptr. NULL and invalid signals + * are considered to have value 0 and their conditions already satisfied. This + * function provides only relaxed memory semantics. + */ +uint32_t HSA_API hsa_amd_signal_wait_all(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, hsa_signal_value_t* values, + uint64_t timeout_hint, hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_values); + +/** + * @brief Wait for any signal-condition pair to be satisfied.
+ * + * @details Allows waiting for any of several signal and condition pairs to be + * satisfied. The function returns the index into the list of signals of the + * first satisfying signal-condition pair. The function returns + * std::numeric_limits<uint32_t>::max() if no valid signal is provided. The value + * of the satisfying signal's value is returned in satisfying_value, unless + * satisfying_value is nullptr or there's no valid signal in the signal-condition + * pairs. NULL and invalid signals are ignored. This function provides only + * relaxed memory semantics. + */ +uint32_t HSA_API + hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals, + hsa_signal_condition_t* conds, + hsa_signal_value_t* values, uint64_t timeout_hint, + hsa_wait_state_t wait_hint, + hsa_signal_value_t* satisfying_value); + +/** @} */ + +/** + * @brief Call a function asynchronously + * + * @details Provides access to the runtime's asynchronous event handling thread + * for general asynchronous functions. Functions queued this way are executed + * in the same manner as if they were a signal handler whose signal is + * satisfied. + * + * @param[in] callback asynchronous function to be invoked + * + * @param[in] arg user provided value which is provided to callback when callback + * is invoked + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT callback is invalid (NULL) + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of + * resources or blocking signals are not supported by the HSA driver component. + * + */ +hsa_status_t HSA_API + hsa_amd_async_function(void (*callback)(void* arg), void* arg); + +/** \addtogroup ext-images Images and samplers + * @{ + */ + +/** + * @brief Encodes an opaque vendor specific image format. The length of data + * depends on the underlying format.
This structure must not be copied as its + * true length can not be determined. + */ +typedef struct hsa_amd_image_descriptor_s { + /* + Version number of the descriptor + */ + uint32_t version; + + /* + Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID. + */ + uint32_t deviceID; + + /* + Start of vendor specific data. + */ + uint32_t data[1]; +} hsa_amd_image_descriptor_t; + +/** + * @brief Creates an image from an opaque vendor specific image format. + * Does not modify data at image_data. Intended initially for + * accessing interop images. + * + * @param[in] agent Agent on which to create the image + * + * @param[in] image_descriptor Vendor specific image format + * + * @param[in] image_data Pointer to image backing store + * + * @param[in] access_permission Access permissions for the image object + * + * @param[out] image Created image object. + * + * @retval HSA_STATUS_SUCCESS Image created successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor, + * null image_data, or mismatched access_permission. + */ +hsa_status_t HSA_API hsa_amd_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const hsa_amd_image_descriptor_t *image_layout, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image +); + +/** + * @brief Query image limits. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute HSA image info attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined.
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p value is NULL or @p attribute < + * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute > + * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS. + * + */ +hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Set a queue's CU affinity mask. + * + * @details Enables the queue to run on only selected CUs. The given mask is + * combined by bitwise AND with any device wide mask in HSA_CU_MASK before + * being applied. + * If num_cu_mask_count is 0 then the request is interpreted as a request to + * enable all CUs and no cu_mask array need be given. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[in] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed + * but the given mask attempted to enable a CU which was disabled by + * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not + * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL. + * Devices with work group processors must enable CUs in even-index contiguous + * pairs, e.g. 0x33(b'110011) is valid while 0x5(b'101) and 0x6(b'0110) + * are invalid.
+ * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +/** + * @brief Retrieve a queue's CU affinity mask. + * + * @details Returns the first num_cu_mask_count bits of a queue's CU mask. + * Ensure that num_cu_mask_count is at least as large as + * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[out] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a memory pool. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_AMD_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_AMD_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_AMD_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_AMD_SEGMENT_GROUP = 3, +} hsa_amd_segment_t; + +/** + * @brief A memory pool encapsulates physical storage on an agent + * along with a memory access model. 
+ * + * @details A memory pool encapsulates a physical partition of an agent's + * memory system along with a memory access model. Division of a single + * memory system into separate pools allows querying each partition's access + * path properties (see ::hsa_amd_agent_memory_pool_get_info). Allocations + * from a pool are preferentially bound to that pool's physical partition. + * Binding to the pool's preferential physical partition may not be + * possible or persistent depending on the system's memory policy + * and/or state which is beyond the scope of HSA APIs. + * + * For example, a multi-node NUMA memory system may be represented by multiple + * pool's with each pool providing size and access path information for the + * partition it represents. Allocations from a pool are preferentially bound + * to the pool's partition (which in this example is a NUMA node) while + * following its memory access model. The actual placement may vary or migrate + * due to the system's NUMA policy and state, which is beyond the scope of + * HSA APIs. + */ +typedef struct hsa_amd_memory_pool_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_amd_memory_pool_t; + +typedef enum hsa_amd_memory_pool_global_flag_s { + /** + * The application can use allocations in the memory pool to store kernel + * arguments, and provide the values for the kernarg segment of + * a kernel dispatch. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1, + /** + * Updates to memory in this pool conform to HSA memory consistency model. + * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED + * must not be set. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Writes to memory in this pool can be performed by a single agent at a time. + */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** Updates to memory in this memory pool have extended scope, acting as + * system-scope atomics for variables in memory regions of this type. 
+ * Note: On non-compliant systems, device-specific actions may be required + * for system-scope coherence. */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8, + +} hsa_amd_memory_pool_global_flag_t; + +typedef enum hsa_amd_memory_pool_location_s { + /** + * This memory pool resides on the host (CPU) + */ + HSA_AMD_MEMORY_POOL_LOCATION_CPU = 0, + /** + * This memory pool resides on a GPU + */ + HSA_AMD_MEMORY_POOL_LOCATION_GPU = 1 +} hsa_amd_memory_pool_location_t; + +/** + * @brief Memory pool features. + */ +typedef enum { + /** + * Segment where the memory pool resides. The type of this attribute is + * ::hsa_amd_segment_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type + * of + * this attribute is uint32_t, a bit-field of + * ::hsa_amd_memory_pool_global_flag_t + * values. + */ + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this pool, in bytes. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_SIZE = 2, + /** + * Indicates whether memory in this pool can be allocated using + * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for memory pools in the group and + * private segments. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by + * ::hsa_amd_memory_pool_allocate + * in this memory pool. The size of a buffer allocated in this pool is a + * multiple of the value of this attribute. While this is the minimum size of + * allocation allowed, it is recommened to use + * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE to obtain the recommended + * allocation granularity size for this pool. + * The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for + * this pool. 
The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this + * pool. The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and + * must be a power of 2. The type of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, + /** + * This memory_pool can be made directly accessible by all the agents in the + * system (::hsa_amd_agent_memory_pool_get_info does not return + * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this + * attribute is bool. + */ + HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15, + /** + * Maximum aggregate allocation size in bytes. The type of this attribute + * is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16, + /** + * Location of this memory pool. The type of this attribute + * is hsa_amd_memory_pool_location_t. + */ + HSA_AMD_MEMORY_POOL_INFO_LOCATION = 17, + /** + * Internal block size for allocations. This would also be the recommended + * granularity size for allocations as this prevents internal fragmentation. + * The value of this attribute is only defined if + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool. + * The size of this attribute is size_t. + */ + HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE = 18, +} hsa_amd_memory_pool_info_t; + +/** + * @brief Memory pool flag used to specify allocation directives + * + */ +typedef enum hsa_amd_memory_pool_flag_s { + /** + * Allocates memory that conforms to standard HSA memory consistency model + */ + HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0, + /** + * Allocates fine grain memory type where memory ordering is per point to point + * connection. Atomic memory operations on these memory buffers are not + * guaranteed to be visible at system scope. 
+ */ + HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0), + /** + * Allocates physically contiguous memory + */ + HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1), + /** + * Allocates executable memory + */ + HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG = (1 << 2), + /** + * Allocates uncached memory + */ + HSA_AMD_MEMORY_POOL_UNCACHED_FLAG = (1 << 3), +} hsa_amd_memory_pool_flag_t; + +/** + * @brief Get the current value of an attribute of a memory pool. + * + * @param[in] memory_pool A valid memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to a application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool, + hsa_amd_memory_pool_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory pools associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details An agent can directly access buffers located in some memory pool, or + * be enabled to access them by the application (see ::hsa_amd_agents_allow_access), + * yet that memory pool may not be returned by this function for that given + * agent. + * + * A memory pool of fine-grained type must be associated only with the host. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked on the same thread that called + * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is + * associated with the agent. The HSA runtime passes two arguments to the + * callback: the memory pool, and the application data. 
If @p callback + * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration, + * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that status + * value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data), + void* data); + +/** + * @brief Allocate a block of memory (or buffer) in the specified pool. + * + * @param[in] memory_pool Memory pool where to allocate memory from. The memory + * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. + * + * @param[out] ptr Pointer to the location where to store the base virtual + * address of + * the allocated block. The returned base address is aligned to the value of + * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in @p memory_pool. If the + * allocation fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p memory_pool, or @p size is greater than + * the value of HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0, + * or flags is not 0. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size, + uint32_t flags, void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_amd_memory_pool_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent. + * Because the DMA engines used may not be in the same coherency domain, the caller must ensure + * that buffers are system-level coherent. In general this requires the sending device to have + * released the buffer to system scope prior to executing the copy API and the receiving device + * must execute a system scope acquire fence prior to use of the destination buffer. + * + * @param[out] dst Buffer where the content is to be copied. + * + * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. 
+ * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer, otherwise the copy will succeed + * but the contents of @p dst are undefined. + * + * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly + * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the source agent. + * Discovery may have variable and/or high latency. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the copy + * operation starts. The copy will start after every signal has been observed with + * the value 0. The dependent signal should not include completion signal from + * hsa_amd_memory_async_copy operation to be issued in future as that can result + * in a deadlock. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the copy + * operation. When the copy operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. The signal handle must not be 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The + * application is responsible for checking for asynchronous error conditions + * (see the description of @p completion_signal). + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0. + */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Asynchronously copy a block of memory from the location pointed to by + * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p + * dst_agent on engine_id. + * + * WARNING: Concurrent use of this call with hsa_amd_memory_async_copy can result + * in resource conflicts as HSA runtime will auto assign engines with the latter + * call. Approach using both calls concurrently with caution. + * + * All param definitions are identical to hsa_amd_memory_async_copy with the + * exception of engine_id and force_copy_on_sdma. + * + * @param[in] engine_id Target engine defined by hsa_amd_sdma_engine_id_t. + * Client should use hsa_amd_memory_copy_engine_status first to get the ID + * availability. + * + * @param[in] force_copy_on_sdma By default, blit kernel copies are used when + * dst_agent == src_agent. Setting this to true will force the copy over SDMA1. + * + * All return definitions are identical to hsa_amd_memory_async_copy with the + * following amendments: + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL, or the completion signal is 0 or engine_id is improperly + * bounded.
+ */ +hsa_status_t HSA_API + hsa_amd_memory_async_copy_on_engine(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal, + hsa_amd_sdma_engine_id_t engine_id, + bool force_copy_on_sdma); +/** + * @brief Reports the availability of SDMA copy engines. + * + * @param[in] dst_agent Destination agent of copy status direction. + * + * @param[in] src_agent Source agent of copy status direction. + * + * @param[out] engine_ids_mask returns available SDMA engine IDs that can be masked + * with hsa_amd_sdma_engine_id_t. + * + * @retval ::HSA_STATUS_SUCCESS Agent has available SDMA engines. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Agent does not have available SDMA engines. + * + */ +hsa_status_t HSA_API +hsa_amd_memory_copy_engine_status(hsa_agent_t dst_agent, hsa_agent_t src_agent, + uint32_t *engine_ids_mask); + /** + * @brief Returns the preferred SDMA engine mask. + * + * @param[in] dst_agent Destination agent of copy status direction. + * + * @param[in] src_agent Source agent of copy status direction. + * + * @param[out] recommended_ids_mask returns available SDMA engine IDs for max bandwidth + * that can be masked with hsa_amd_sdma_engine_id_t. Can be 0 if there is no preference + * + * @retval ::HSA_STATUS_SUCCESS For mask returned + * + */ +hsa_status_t HSA_API +hsa_amd_memory_get_preferred_copy_engine(hsa_agent_t dst_agent, hsa_agent_t src_agent, + uint32_t* recommended_ids_mask); + +/* +[Provisional API] +Pitched memory descriptor. +All elements must be 4 byte aligned. Pitch and slice are in bytes. +*/ +typedef struct hsa_pitched_ptr_s { + void* base; + size_t pitch; + size_t slice; +} hsa_pitched_ptr_t; + +/* +[Provisional API] +Copy direction flag. 
+*/ +typedef enum { + hsaHostToHost = 0, + hsaHostToDevice = 1, + hsaDeviceToHost = 2, + hsaDeviceToDevice = 3 +} hsa_amd_copy_direction_t; + +/* +[Provisional API] +SDMA 3D memory copy API. The same requirements must be met by src and dst as in +hsa_amd_memory_async_copy. +Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects +must not overlap. +CPU agents are not supported. API requires SDMA and will return an error if SDMA is not available. +Offsets and range carry x in bytes, y and z in rows and layers. +*/ +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** + * @brief Type of accesses to a memory pool from a given agent. + */ +typedef enum { + /** + * The agent cannot directly access any buffer in the memory pool. + */ + HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0, + /** + * The agent can directly access a buffer located in the pool; the application + * does not need to invoke ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1, + /** + * The agent can directly access a buffer located in the pool, but only if the + * application has previously requested access to that buffer using + * ::hsa_amd_agents_allow_access. + */ + HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2 +} hsa_amd_memory_pool_access_t; + +/** + * @brief Type of bus/link through which an agent accesses a memory pool. + */ +typedef enum { + /** + * Hyper-transport bus type. + */ + HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0, + + /** + * QPI bus type. + */ + HSA_AMD_LINK_INFO_TYPE_QPI = 1, + + /** + * PCIe bus type. + */ + HSA_AMD_LINK_INFO_TYPE_PCIE = 2, + + /** + * Infiniband bus type.
+ */ + HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3, + + /** + * xGMI link type. + */ + HSA_AMD_LINK_INFO_TYPE_XGMI = 4 + +} hsa_amd_link_info_type_t; + +/** + * @brief Link properties when accessing the memory pool from the specified + * agent. + */ +typedef struct hsa_amd_memory_pool_link_info_s { + /** + * Minimum transfer latency (rounded to ns). + */ + uint32_t min_latency; + + /** + * Maximum transfer latency (rounded to ns). + */ + uint32_t max_latency; + + /** + * Minimum link interface bandwidth in MB/s. + */ + uint32_t min_bandwidth; + + /** + * Maximum link interface bandwidth in MB/s. + */ + uint32_t max_bandwidth; + + /** + * Support for 32-bit atomic transactions. + */ + bool atomic_support_32bit; + + /** + * Support for 64-bit atomic transactions. + */ + bool atomic_support_64bit; + + /** + * Support for cache coherent transactions. + */ + bool coherent_support; + + /** + * The type of bus/link. + */ + hsa_amd_link_info_type_t link_type; + + /** + * NUMA distance of memory pool relative to querying agent + */ + uint32_t numa_distance; +} hsa_amd_memory_pool_link_info_t; + +/** + * @brief Properties of the relationship between an agent and a memory pool. + */ +typedef enum { + /** + * Access to buffers located in the memory pool. The type of this attribute + * is ::hsa_amd_memory_pool_access_t. + * + * An agent can always directly access buffers currently located in a memory + * pool that is associated (the memory_pool is one of the values returned by + * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the + * buffer is currently located in a memory pool that is not associated with + * the agent, and the value returned by this function for the given + * combination of agent and memory pool is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke + * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer.
+ * + * If the given agent can directly access buffers in the pool, the result is not + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is associated with + * the agent, or it is of fine-grained type, the result must not be + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not associated + * with the agent, and does not reside in the global segment, the result must + * be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0, + + /** + * Number of links to hop when accessing the memory pool from the specified + * agent. The value of this attribute is zero if the memory pool is associated + * with the agent, or if the access type is + * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is + * uint32_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1, + + /** + * Details of each link hop when accessing the memory pool starting from the + * specified agent. The type of this attribute is an array of + * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS elements, each of type + * ::hsa_amd_memory_pool_link_info_t. + */ + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2 + +} hsa_amd_agent_memory_pool_info_t; + +/** + * @brief Get the current value of an attribute of the relationship between an + * agent and a memory pool. + * + * @param[in] agent Agent. + * + * @param[in] memory_pool Memory pool. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + */ +hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( + hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, + hsa_amd_agent_memory_pool_info_t attribute, void* value); + +/** + * @brief Enable direct access to a buffer from a given set of agents. + * + * @details + * + * Upon return, only the listed agents and the agent associated with the + * buffer's memory pool have direct access to the @p ptr. + * + * Any agent that has access to the buffer before and after the call to + * ::hsa_amd_agents_allow_access will also have access while + * ::hsa_amd_agents_allow_access is in progress. + * + * The caller is responsible for ensuring that each agent in the list + * must be able to access the memory pool containing @p ptr + * (using ::hsa_amd_agent_memory_pool_get_info with ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute), + * otherwise error code is returned. + * + * @param[in] num_agents Size of @p agents. + * + * @param[in] agents List of agents. If @p num_agents is 0, this argument is + * ignored. + * + * @param[in] flags A list of bit-field that is used to specify access + * information in a per-agent basis. This is currently reserved and must be NULL. + * + * @param[in] ptr A buffer previously allocated using ::hsa_amd_memory_pool_allocate. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, or @p agents + * is NULL, @p flags is not NULL, or attempting to enable access to agent(s) + * because @p ptr is allocated from an inaccessible pool. + * + */ +hsa_status_t HSA_API + hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr); + +/** + * @brief Query if buffers currently located in some memory pool can be + * relocated to a destination memory pool. 
+ * + * @details If the returned value is non-zero, a migration of a buffer to @p + * dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail due to + * resource limitations. + * + * @param[in] src_memory_pool Source memory pool. + * + * @param[in] dst_memory_pool Destination memory pool. + * + * @param[out] result Pointer to a memory location where the result of the query + * is stored. Must not be NULL. If buffers currently located in @p + * src_memory_pool can be relocated to @p dst_memory_pool, the result is + * true. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API + hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool, + hsa_amd_memory_pool_t dst_memory_pool, + bool* result); + +/** + * @brief Relocate a buffer to a new memory pool. + * + * @details When a buffer is migrated, its virtual address remains the same but + * its physical contents are moved to the indicated memory pool. + * + * After migration, only the agent associated with the destination pool will have access. + * + * The caller is also responsible for ensuring that the allocation in the + * source memory pool where the buffer is currently located can be migrated to the + * specified destination memory pool (using ::hsa_amd_memory_pool_can_migrate returns a value of true + * for the source and destination memory pools), otherwise behavior is undefined. + * + * The caller must ensure that the buffer is not accessed while it is migrated. + * + * @param[in] ptr Buffer to be relocated. The buffer must have been released to system + * prior to call this API. The buffer will be released to system upon completion. 
+ * + * @param[in] memory_pool Memory pool where to place the buffer. + * + * @param[in] flags A bit-field that is used to specify migration + * information. Must be zero. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr, + hsa_amd_memory_pool_t memory_pool, + uint32_t flags); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e multiple mappings are permitted). In + * this case, the same input @p host_ptr may give different locked @p agent_ptr and when it does, + * they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to @p agent_ptr are coarse grained. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handle to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agents in @p agents is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr); + +/** + * + * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and + * return a new pointer accessible by the @p agents. If the @p host_ptr overlaps with previously + * locked memory, then the overlap area is kept locked (i.e. multiple mappings are permitted). + * In this case, the same input @p host_ptr may give different locked @p agent_ptr and when it + * does, they are not necessarily coherent (i.e. accessing either @p agent_ptr is not equivalent). + * Accesses to the memory via @p agent_ptr have the same access properties as memory allocated from + * @p pool as determined by ::hsa_amd_memory_pool_get_info and ::hsa_amd_agent_memory_pool_get_info + * (ex. coarse/fine grain, platform atomic support, link info). Physical composition and placement + * of the memory (ex. page size, NUMA binding) is not changed. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator. + * + * @param[in] size The size to be locked. + * + * @param[in] agents Array of agent handles to gain access to the @p host_ptr. + * If this parameter is NULL and the @p num_agent is 0, all agents + * in the platform will gain access to the @p host_ptr. + * + * @param[in] pool Global memory pool owned by a CPU agent. + * + * @param[in] flags A bit-field that is used to specify allocation + * directives. Reserved parameter, must be 0.
+ * + * @param[out] agent_ptr Pointer to the location where to store the new address. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in + * allocating the necessary resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is + * invalid or can not access @p pool. + * + * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned + * by a CPU agent. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or + * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents + * is NULL but @p num_agent is not 0 or flags is not 0. + */ +hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents, + int num_agent, hsa_amd_memory_pool_t pool, + uint32_t flags, void** agent_ptr); + +/** + * + * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or + * ::hsa_amd_memory_lock_to_pool. + * + * @details The behavior is undefined if the host pointer being unpinned does not + * match previous pinned address or if the host pointer was already deallocated. + * + * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was + * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr); + +/** + * @brief Sets the first @p count of uint32_t of the block of memory pointed by + * @p ptr to the specified @p value. + * + * @param[in] ptr Pointer to the block of memory to fill. + * + * @param[in] value Value to be set. 
+ * + * @param[in] count Number of uint32_t element to be set to the value. + * + * @retval HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or + * not 4 bytes aligned + * + * @retval HSA_STATUS_ERROR_INVALID_ALLOCATION if the given memory + * region was not allocated with HSA runtime APIs. + * + */ +hsa_status_t HSA_API + hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count); + +/** + * @brief Maps an interop object into the HSA flat address space and establishes + * memory residency. The metadata pointer is valid during the lifetime of the + * map (until hsa_amd_interop_unmap_buffer is called). + * Multiple calls to hsa_amd_interop_map_buffer with the same interop_handle + * result in multiple mappings with potentially different addresses and + * different metadata pointers. Concurrent operations on these addresses are + * not coherent. Memory must be fenced to system scope to ensure consistency, + * between mappings and with any views of this buffer in the originating + * software stack. + * + * @param[in] num_agents Number of agents which require access to the memory + * + * @param[in] agents List of accessing agents. 
+ * + * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux) + * + * @param [in] flags Reserved, must be 0 + * + * @param[out] size Size in bytes of the mapped object + * + * @param[out] ptr Base address of the mapped object + * + * @param[out] metadata_size Size of metadata in bytes, may be NULL + * + * @param[out] metadata Pointer to metadata, may be NULL + * + * @retval HSA_STATUS_SUCCESS if successfully mapped + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors + */ +hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents, + hsa_agent_t* agents, + int interop_handle, + uint32_t flags, + size_t* size, + void** ptr, + size_t* metadata_size, + const void** metadata); + +/** + * @brief Removes a previously mapped interop object from HSA's flat address space. + * Ends lifetime for the mapping's associated metadata pointer. + */ +hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr); + +/** + * @brief Denotes the type of memory in a pointer info query. + */ +typedef enum { + /* + Memory is not known to the HSA driver. Unallocated or unlocked system memory. + */ + HSA_EXT_POINTER_TYPE_UNKNOWN = 0, + /* + Memory was allocated with an HSA memory allocator. + */ + HSA_EXT_POINTER_TYPE_HSA = 1, + /* + System memory which has been locked for use with an HSA agent. + + Memory of this type is normal malloc'd memory and is always accessible to + the CPU. Pointer info queries may not include CPU agents in the accessible + agents list as the CPU has implicit access. + */ + HSA_EXT_POINTER_TYPE_LOCKED = 2, + /* + Memory originated in a graphics component and is shared with ROCr. + */ + HSA_EXT_POINTER_TYPE_GRAPHICS = 3, + /* + Memory has been shared with the local process via ROCr IPC APIs. 
+ */ + HSA_EXT_POINTER_TYPE_IPC = 4, + /* + No backend memory but virtual address + */ + HSA_EXT_POINTER_TYPE_RESERVED_ADDR = 5, + /* + Memory was allocated with an HSA virtual memory allocator + */ + HSA_EXT_POINTER_TYPE_HSA_VMEM = 6 +} hsa_amd_pointer_type_t; + +/** + * @brief Describes a memory allocation known to ROCr. + * Within a ROCr major version this structure can only grow. + */ +typedef struct hsa_amd_pointer_info_s { + /* + Size in bytes of this structure. Used for version control within a major ROCr + revision. Set to sizeof(hsa_amd_pointer_info_t) prior to calling + hsa_amd_pointer_info. If the runtime supports an older version of pointer + info then size will be smaller on return. Members starting after the return + value of size will not be updated by hsa_amd_pointer_info. + */ + uint32_t size; + /* + The type of allocation referenced. + */ + hsa_amd_pointer_type_t type; + /* + Base address at which non-host agents may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* agentBaseAddress; + /* + Base address at which the host agent may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* hostBaseAddress; + /* + Size of the allocation. This field is not meaningful if the type of the allocation + is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + size_t sizeInBytes; + /* + Application provided value. This field is not meaningful if the type of the + allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + void* userData; + /* + Reports an agent which "owns" (ie has preferred access to) the pool in which the + allocation was + made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die + GPU boards) any such agent may be returned.
This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in + this process, e.g. if this agent is masked using ROCR_VISIBLE_DEVICES. + */ + hsa_agent_t agentOwner; + /* + Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. + Reports the effective global flags bitmask for the allocation. This field is not + meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + uint32_t global_flags; + + /* + Set to true if this allocation was registered with the underlying driver. + This field is not meaningful if the type of the allocation is + HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + bool registered; +} hsa_amd_pointer_info_t; + +/** + * @brief Retrieves information about the allocation referenced by the given + * pointer. Optionally returns the number and list of agents which can + * directly access the allocation. In case this virtual address is unknown, the + * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields + * that are valid after hsa_amd_pointer_info returns are size and type. + * + * @param[in] ptr Pointer which references the allocation to retrieve info for. + * + * @param[in, out] info Pointer to structure to be filled with allocation info. + * Data member size must be set to the size of the structure prior to calling + * hsa_amd_pointer_info. On return size will be set to the size of the + * pointer info structure supported by the runtime, if smaller. Members + * beyond the returned value of size will not be updated by the API. + * Must not be NULL. + * + * @param[in] alloc Function pointer to an allocator used to allocate the + * @p accessible array. If NULL @p accessible will not be returned. + * + * @param[out] num_agents_accessible Receives the count of agents in + * @p accessible. If NULL @p accessible will not be returned.
+ * + * @param[out] accessible Receives a pointer to the array, allocated by @p alloc, + * holding the list of agents which may directly access the allocation. + * May be NULL. + * + * @retval HSA_STATUS_SUCCESS Info retrieved successfully + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info. + */ +hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr, + hsa_amd_pointer_info_t* info, + void* (*alloc)(size_t), + uint32_t* num_agents_accessible, + hsa_agent_t** accessible); + +/** + * @brief Associates an arbitrary pointer with an allocation known to ROCr. + * The pointer can be fetched by hsa_amd_pointer_info in the userData field. + * + * @param[in] ptr Pointer to the first byte of an allocation known to ROCr + * with which to associate @p userdata. + * + * @param[in] userdata Arbitrary pointer to associate with the allocation. + * + * @retval HSA_STATUS_SUCCESS @p userdata successfully stored. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr. + */ +hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr, + void* userdata); + +/** + * @brief 256-bit process independent identifier for a ROCr shared memory + * allocation. + */ +typedef struct hsa_amd_ipc_memory_s { + uint32_t handle[8]; +} hsa_amd_ipc_memory_t; + +/** + * @brief Prepares an allocation for interprocess sharing and creates a + * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation. A + * handle is valid while the allocation it references remains accessible in + * any process.
In general applications should confirm that a shared memory + * region has been attached (via hsa_amd_ipc_memory_attach) in the remote + * process prior to releasing that memory in the local process. + * Repeated calls for the same allocation may, but are not required to, return + * unique handles. The allocation needs to be on memory on an agent of type + * HSA_DEVICE_TYPE_GPU. + * + * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for + * sharing. + * + * @param[in] len Length in bytes of the allocation to share. + * + * @param[out] handle Process independent identifier referencing the shared + * allocation. + * + * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the + * first byte of an allocation made through ROCr, or len is not the full length + * of the allocation or handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len, + hsa_amd_ipc_memory_t* handle); + +/** + * @brief Imports shared memory into the local process and makes it accessible + * by the given agents. If a shared memory handle is attached multiple times + * in a process each attach may return a different address. Each returned + * address is refcounted and requires a matching number of calls to + * hsa_amd_ipc_memory_detach to release the shared memory mapping. + * + * @param[in] handle Pointer to the identifier for the shared memory. + * + * @param[in] len Length of the shared memory to import. + * Reserved. Must be the full length of the shared allocation in this version. + * + * @param[in] num_agents Count of agents in @p mapping_agents. + * May be zero if all agents are to be allowed access. 
+ * + * @param[in] mapping_agents List of agents to access the shared memory. + * Ignored if @p num_agents is zero. + * + * @param[out] mapped_ptr Receives a process local pointer to the shared memory. + * + * @retval HSA_STATUS_SUCCESS if memory is successfully imported. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is + * incorrect, @p mapped_ptr is NULL, or some agent for which access was + * requested can not access the shared memory. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_attach( + const hsa_amd_ipc_memory_t* handle, size_t len, + uint32_t num_agents, + const hsa_agent_t* mapping_agents, + void** mapped_ptr); + +/** + * @brief Decrements the reference count for the shared memory mapping and + * releases access to shared memory imported with hsa_amd_ipc_memory_attach. + * + * @param[in] mapped_ptr Pointer to the first byte of a shared allocation + * imported with hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_SUCCESS if @p mapped_ptr was imported with + * hsa_amd_ipc_memory_attach. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported + * with hsa_amd_ipc_memory_attach. + */ +hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr); + +/** @} */ + +/** \addtogroup status Runtime notifications + * @{ + */ + +/** + * @brief 256-bit process independent identifier for a ROCr IPC signal. + */ +typedef hsa_amd_ipc_memory_t hsa_amd_ipc_signal_t; + +/** + * @brief Obtains an interprocess sharing handle for a signal. The handle is + * valid while the signal it references remains valid in any process.
In + * general applications should confirm that the signal has been attached (via + * hsa_amd_ipc_signal_attach) in the remote process prior to destroying that + * signal in the local process. + * Repeated calls for the same signal may, but are not required to, return + * unique handles. + * + * @param[in] signal Signal created with attribute HSA_AMD_SIGNAL_IPC. + * + * @param[out] handle Process independent identifier referencing the shared + * signal. + * + * @retval HSA_STATUS_SUCCESS @p handle is ready to use for interprocess sharing. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is not a valid signal + * created with attribute HSA_AMD_SIGNAL_IPC or handle is NULL. + */ +hsa_status_t HSA_API hsa_amd_ipc_signal_create(hsa_signal_t signal, hsa_amd_ipc_signal_t* handle); + +/** + * @brief Imports an IPC capable signal into the local process. If an IPC + * signal handle is attached multiple times in a process each attach may return + * a different signal handle. Each returned signal handle is refcounted and + * requires a matching number of calls to hsa_signal_destroy to release the + * shared signal. + * + * @param[in] handle Pointer to the identifier for the shared signal. + * + * @param[out] signal Receives a process local signal handle to the shared signal. + * + * @retval HSA_STATUS_SUCCESS if the signal is successfully imported. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized + * + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid. + */ +hsa_status_t HSA_API hsa_amd_ipc_signal_attach(const hsa_amd_ipc_signal_t* handle, + hsa_signal_t* signal); + +/** + * @brief GPU system event type.
+ */ +typedef enum hsa_amd_event_type_s { + /* + AMD GPU memory fault. + */ + HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0, + /* + AMD GPU HW Exception. + */ + HSA_AMD_GPU_HW_EXCEPTION_EVENT, + /* + AMD GPU memory error. + */ + HSA_AMD_GPU_MEMORY_ERROR_EVENT, +} hsa_amd_event_type_t; + +/** + * @brief Flags denoting the cause of a memory fault. + */ +typedef enum { + // Page not present or supervisor privilege. + HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0, + // Write access to a read-only page. + HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1, + // Execute access to a page marked NX. + HSA_AMD_MEMORY_FAULT_NX = 1 << 2, + // GPU attempted access to a host only page. + HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, + // DRAM ECC failure. + HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4, + // Can't determine the exact fault address. + HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, + // SRAM ECC failure (ie registers, no fault address). + HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6, + // GPU reset following unspecified hang. + HSA_AMD_MEMORY_FAULT_HANG = 1U << 31 +} hsa_amd_memory_fault_reason_t; + +/** + * @brief AMD GPU memory fault event data. + */ +typedef struct hsa_amd_gpu_memory_fault_info_s { + /* + The agent where the memory fault occurred. + */ + hsa_agent_t agent; + /* + Virtual address accessed. + */ + uint64_t virtual_address; + /* + Bit field encoding the memory access failure reasons. There could be multiple bits set + for one fault. Bits are defined in hsa_amd_memory_fault_reason_t. + */ + uint32_t fault_reason_mask; +} hsa_amd_gpu_memory_fault_info_t; + +/** + * @brief Flags denoting the cause of a memory error. + */ +typedef enum { + // Memory was in use by low-level HW component and cannot be released + HSA_AMD_MEMORY_ERROR_MEMORY_IN_USE = (1 << 0), +} hsa_amd_memory_error_reason_t; + +/** + * @brief AMD GPU memory error event data. + */ +typedef struct hsa_amd_gpu_memory_error_info_s { + /* + The agent where the memory error occurred. 
+ */ + hsa_agent_t agent; + /* + Virtual address involved. + */ + uint64_t virtual_address; + /* + Bit field encoding the memory error failure reasons. There could be multiple bits set + for one error. Bits are defined in hsa_amd_memory_error_reason_t. + */ + uint32_t error_reason_mask; +} hsa_amd_gpu_memory_error_info_t; + +/** + * @brief Flags denoting the type of a HW exception + */ +typedef enum { + // Unused for now + HSA_AMD_HW_EXCEPTION_RESET_TYPE_OTHER = 1 << 0, +} hsa_amd_hw_exception_reset_type_t; + +/** + * @brief Flags denoting the cause of a HW exception + */ +typedef enum { + // GPU Hang + HSA_AMD_HW_EXCEPTION_CAUSE_GPU_HANG = 1 << 0, + // SRAM ECC + HSA_AMD_HW_EXCEPTION_CAUSE_ECC = 1 << 1, +} hsa_amd_hw_exception_reset_cause_t; + +/** + * @brief AMD GPU HW Exception event data. + */ +typedef struct hsa_amd_gpu_hw_exception_info_s { + /* + The agent where the HW exception occurred. + */ + hsa_agent_t agent; + hsa_amd_hw_exception_reset_type_t reset_type; + hsa_amd_hw_exception_reset_cause_t reset_cause; +} hsa_amd_gpu_hw_exception_info_t; + +/** + * @brief AMD GPU event data passed to event handler. + */ +typedef struct hsa_amd_event_s { + /* + The event type. + */ + hsa_amd_event_type_t event_type; + union { + /* + The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT. + */ + hsa_amd_gpu_memory_fault_info_t memory_fault; + /* + The HW exception info, only valid when @p event_type is HSA_AMD_GPU_HW_EXCEPTION_EVENT. + */ + hsa_amd_gpu_hw_exception_info_t hw_exception; + /* + The memory error info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_ERROR_EVENT. + */ + hsa_amd_gpu_memory_error_info_t memory_error; + }; +} hsa_amd_event_t; + +typedef hsa_status_t (*hsa_amd_system_event_callback_t)(const hsa_amd_event_t* event, void* data); + +/** + * @brief Register AMD GPU event handler. + * + * @param[in] callback Callback to be invoked when an event is triggered.
+ * The HSA runtime passes two arguments to the callback: @p event + * is defined per event by the HSA runtime, and @p data is the user data. + * + * @param[in] data User data that is passed to @p callback. May be NULL. + * + * @retval HSA_STATUS_SUCCESS The handler has been registered successfully. + * + * @retval HSA_STATUS_ERROR An event handler has already been registered. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p event is invalid. + */ +hsa_status_t HSA_API hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback, + void* data); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Per-queue dispatch and wavefront scheduling priority. + */ +typedef enum hsa_amd_queue_priority_s { + /* + Below normal/high priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_LOW = 0, + /* + Above low priority compute, below high priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_NORMAL = 1, + /* + Above low/normal priority compute and all graphics + */ + HSA_AMD_QUEUE_PRIORITY_HIGH = 2, +} hsa_amd_queue_priority_t; + +/** + * @brief Modifies the dispatch and wavefront scheduling priority for a + * given compute queue. The default is HSA_AMD_QUEUE_PRIORITY_NORMAL. + * + * @param[in] queue Compute queue to apply new priority to. + * + * @param[in] priority Priority to associate with queue. + * + * @retval HSA_STATUS_SUCCESS if priority was changed successfully. + * + * @retval HSA_STATUS_ERROR_INVALID_QUEUE if queue is not a valid + * compute queue handle. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if priority is not a valid + * value from hsa_amd_queue_priority_t. + */ +hsa_status_t HSA_API hsa_amd_queue_set_priority(hsa_queue_t* queue, + hsa_amd_queue_priority_t priority); + +/** + * @brief Queue creation attributes. + */ +typedef enum { + /** + * The queue's packet buffer and queue descriptor struct should be + * allocated in system memory (default).
Mutually exclusive with + * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF and + * HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR. + */ + HSA_AMD_QUEUE_CREATE_SYSTEM_MEM = 0, + /** + * The queue's packet buffer should be allocated in the agent's + * fine-grain device memory region. + */ + HSA_AMD_QUEUE_CREATE_DEVICE_MEM_RING_BUF = (1 << 0), + /** + * The queue descriptor struct should be allocated in the agent's + * fine-grain device memory region. Not supported for devices + * connected via PCIe because the CPU's atomic read-modify-write + * operations cannot be promoted to PCIe atomic read-modify-write + * operations. + */ + HSA_AMD_QUEUE_CREATE_DEVICE_MEM_QUEUE_DESCRIPTOR = (1 << 1), +} hsa_amd_queue_create_flag_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Deallocation notifier function type. + */ +typedef void (*hsa_amd_deallocation_callback_t)(void* ptr, void* user_data); + +/** + * @brief Registers a deallocation notifier monitoring for release of agent + * accessible address @p ptr. If successful, @p callback will be invoked when + * @p ptr is removed from accessibility from all agents. + * + * Notification callbacks are automatically deregistered when they are invoked. + * + * Note: The current version supports notifications of address release + * originating from ::hsa_amd_memory_pool_free. Support for other address + * release APIs will follow. + * + * @param[in] ptr Agent accessible address to monitor for deallocation. Passed + * to @p callback. + * + * @param[in] callback Notifier to be invoked when @p ptr is released from + * agent accessibility. + * + * @param[in] user_data User provided value passed to @p callback. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The notifier registered successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p ptr does not refer to a valid agent accessible + * address.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL or @p ptr is NULL. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating + * necessary resources + */ +hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, + hsa_amd_deallocation_callback_t callback, + void* user_data); + +/** + * @brief Removes a deallocation notifier previously registered with + * ::hsa_amd_register_deallocation_callback. Arguments must be identical to + * those given in ::hsa_amd_register_deallocation_callback. + * + * @param[in] ptr Agent accessible address which was monitored for deallocation. + * + * @param[in] callback Notifier to be removed. + * + * @retval ::HSA_STATUS_SUCCESS The notifier has been removed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The given notifier was not registered. + */ +hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, + hsa_amd_deallocation_callback_t callback); + +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency + * model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent + * at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1, + /** + * Memory region queried contains subregions with both + * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and + * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes. + * + * This attribute can not be used in hsa_amd_svm_attributes_set. It is a + * possible return from hsa_amd_svm_attributes_get indicating that the query + * region contains both coarse and fine grained memory. + */ + HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. 
+ HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Queries the physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // Optimizes with the anticipation that the majority of operations to the + // range will be read operations. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6, + // Allows the execution on GPU. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. 
+ // Enables an agent for access to the range. Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based migration. + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. Bit values should be interpreted according to the type + // given in the associated attribute description. 
+ uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. + * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. + * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. 
+ * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. + */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + +/** @} */ + +/** \addtogroup profile Profiling + * @{ + */ + +/** + * @brief Acquire Stream Performance Monitor on an agent + * + * Acquire exclusive use of SPM on @p preferred_agent. + * See hsa_amd_spm_set_dest_buffer to provide a destination buffer to KFD to start recording and + * retrieve this data. + * @param[in] preferred_agent Agent on which to acquire SPM + */ +hsa_status_t hsa_amd_spm_acquire(hsa_agent_t preferred_agent); + +/** + * @brief Release Stream Performance Monitor on an agent + * + * Release exclusive use of SPM on @p preferred_agent. This will stop KFD writing SPM data. + * If a destination buffer is set, then data in the destination buffer is available to user + * when this function returns. 
+ * + * @param[in] preferred_agent Agent on which to release SPM + */ +hsa_status_t hsa_amd_spm_release(hsa_agent_t preferred_agent); + +/** + * @brief Set up the current destination user mode buffer for stream performance + * counter data. KFD will start writing SPM data into the destination buffer. KFD will continue + * to copy data into the current destination buffer until any of the following functions are called + * - hsa_amd_spm_release + * - hsa_amd_spm_set_dest_buffer with dest set to NULL + * - hsa_amd_spm_set_dest_buffer with dest set to a new buffer + * + * If @p timeout is non-0, the call will wait for up to @p timeout ms for the previous + * buffer to be filled. If the previous buffer is filled before the timeout, @p timeout + * will be updated with the time remaining. If the timeout is exceeded, the function + * copies any partial data available into the previous user buffer and returns success. + * User should not access destination data while KFD is copying data. + * If the previous destination buffer was full, then @p is_data_loss flag is set. + * @p dest is CPU accessible memory. It could be malloc'ed memory or host allocated memory + * + * @param[in] preferred_agent Agent on which to set the dest buffer + * + * @param[in] size_in_bytes size of the buffer + * + * @param[in,out] timeout timeout in milliseconds + * + * @param[out] size_copied number of bytes copied + * + * @param[in] dest destination address. Set to NULL to stop copy on previous buffer + * + * @param[out] is_data_loss true if data was lost + */ +hsa_status_t hsa_amd_spm_set_dest_buffer(hsa_agent_t preferred_agent, size_t size_in_bytes, + uint32_t* timeout, uint32_t* size_copied, void* dest, + bool* is_data_loss); + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Older version of export dmabuf + * + * This is the same as calling the v2 version of export dmabuf with the + * flags argument set to HSA_AMD_DMABUF_MAPPING_TYPE_NONE.
+ * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. + * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* dmabuf, + uint64_t* offset); + + /** + * @brief Obtains an OS specific, vendor neutral, handle to a memory allocation. + * + * Obtains an OS specific handle to GPU agent memory. The memory must be part + * of a single allocation from an hsa_amd_memory_pool_t exposed by a GPU Agent. + * The handle may be used with other APIs (e.g. Vulkan) to obtain shared access + * to the allocation. + * + * Shared access to the memory is not guaranteed to be fine grain coherent even + * if the allocation exported is from a fine grain pool. 
The shared memory + * consistency model will be no stronger than the model exported from, consult + * the importing API to determine the final consistency model. + * + * The allocation's memory remains valid as long as the handle and any mapping + * of the handle remains valid. When the handle and all mappings are closed + * the backing memory will be released for reuse. + * + * @param[in] ptr Pointer to the allocation being exported. + * + * @param[in] size Size in bytes to export following @p ptr. The entire range + * being exported must be contained within a single allocation. + * + * @param[out] dmabuf Pointer to a dma-buf file descriptor holding a reference to the + * allocation. Contents will not be altered in the event of failure. + * + * @param[out] offset Offset in bytes into the memory referenced by the dma-buf + * object at which @p ptr resides. Contents will not be altered in the event + * of failure. + * + * @param[in] flags Bitmask of hsa_amd_dma_buf_mapping_type_t flags. + * + * @retval ::HSA_STATUS_SUCCESS Export completed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT One or more arguments is NULL. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The address range described by + * @p ptr and @p size are not contained within a single allocation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The allocation described by @p ptr + * and @p size was allocated on a device which can not export memory. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The return file descriptor, + * @p dmabuf, could not be created. + */ +hsa_status_t hsa_amd_portable_export_dmabuf_v2(const void* ptr, size_t size, + int* dmabuf, uint64_t* offset, uint64_t flags); + +/** + * @brief Closes an OS specific, vendor neutral, handle to a memory allocation. + * + * Closes an OS specific handle to GPU agent memory. 
+ * + * Applications should close a handle after imports are complete. The handle + * is not required to remain open for the lifetime of imported mappings. The + * referenced allocation will remain valid until all handles and mappings + * are closed. + * + * @param[in] dmabuf Handle to be closed. + * + * @retval ::HSA_STATUS_SUCCESS Handle closed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE A generic error was encountered + * when closing the handle. The handle may have been closed already or an + * async IO error may have occurred. + */ +hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); + +typedef enum hsa_amd_vmem_address_reserve_flag_s { + // Only reserve a VA range without registering it to the underlying driver + HSA_AMD_VMEM_ADDRESS_NO_REGISTER = (1UL << 0), +} hsa_amd_vmem_address_reserve_flag_t; + +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size.
+ * + * Note that this API will be deprecated in a future release and replaced by + * hsa_amd_vmem_address_reserve_align + */ +hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address, + uint64_t flags); + +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] alignment requested. 0 for default. Must be >= page-size and a power of 2 + * @param[in] flags optional hsa_amd_vmem_address_reserve_flag_t + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + */ +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + +/** + * @brief Free a reserved address range + * + * Free a previously allocated address range. The size must match the size of a previously + * allocated address range. 
+ * + * @param[in] va virtual address to be freed + * @param[in] size of address range + * + * @retval ::HSA_STATUS_SUCCESS Address range released successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid va specified + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid size specified + * @retval ::HSA_STATUS_ERROR_RESOURCE_FREE Address range is still in use + * @retval ::HSA_STATUS_ERROR Internal unexpected error + */ +hsa_status_t hsa_amd_vmem_address_free(void* va, size_t size); + +/** + * @brief Struct containing an opaque handle to a memory allocation handle + */ +typedef struct hsa_amd_vmem_alloc_handle_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_amd_vmem_alloc_handle_t; + +typedef enum { + MEMORY_TYPE_NONE, + MEMORY_TYPE_PINNED, +} hsa_amd_memory_type_t; + +/** + * @brief Create a virtual memory handle + * + * Create a virtual memory handle within this pool + * @p size must be aligned to allocation granule size for this memory pool, see + * HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE + * To minimize internal memory fragmentation, align the size to the recommended allocation granule + * size, see HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE + * + * @param[in] pool memory to use + * @param[in] size of the memory allocation + * @param[in] type of memory + * @param[in] flags - currently unsupported + * @param[out] memory_handle - handle for the allocation + * + * @retval ::HSA_STATUS_SUCCESS memory allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid arguments + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION This memory pool does not support allocations + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate this memory + */ +hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, + hsa_amd_memory_type_t type, uint64_t flags, + hsa_amd_vmem_alloc_handle_t* memory_handle); + +/** + * @brief Release a virtual memory handle + * + * @param[in] memory_handle handle that was previously allocated + * + * @retval ::HSA_STATUS_SUCCESS Memory handle released successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + */ +hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle); + +/** + * @brief Map a virtual memory handle + * + * Map a virtual memory handle to a reserved address range. The virtual address requested must be + * within a previously reserved address range. @p va and (@p va + size) must be within + * (va + size) of the previous allocated address range. + * @p size must be equal to size of the @p memory_handle + * hsa_amd_vmem_set_access needs to be called to make the memory accessible to specific agents + * + * @param[in] va virtual address range where memory will be mapped + * @param[in] size of memory mapping + * @param[in] in_offset offset into memory. Currently unsupported + * @param[in] memory_handle virtual memory handle to be mapped + * @param[in] flags.
Currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS Memory mapped successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset, + hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags); + +/** + * @brief Unmap a virtual memory handle + * + * Unmap previously mapped virtual address range + * + * @param[in] va virtual address range to be unmapped + * @param[in] size of memory mapping + * + * @retval ::HSA_STATUS_SUCCESS Memory backing unmapped successfully + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT size is invalid + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_unmap(void* va, size_t size); + +typedef struct hsa_amd_memory_access_desc_s { + hsa_access_permission_t permissions; + hsa_agent_t agent_handle; +} hsa_amd_memory_access_desc_t; + +/** + * @brief Make a memory mapping accessible + * + * Make previously mapped virtual address accessible to specific agents. @p size must be equal to + * size of previously mapped virtual memory handle.
+ * Calling hsa_amd_vmem_set_access multiple times on the same @p va: + * - Will overwrite permissions for agents specified in @p desc + * - Will leave permissions unchanged for agents not specified in @p desc + * + * @param[in] va previously mapped virtual address + * @param[in] size of memory mapping + * @param[in] desc list of access permissions for each agent + * @param[in] desc_cnt number of elements in desc + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT va, size or memory_handle are invalid + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION memory_handle is invalid + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent in desc + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, + const hsa_amd_memory_access_desc_t* desc, + size_t desc_cnt); + +/** + * @brief Get current access permissions for memory mapping + * + * Get access permissions for memory mapping for specific agent. + * + * @param[in] va previously mapped virtual address + * @param[out] perms current permissions + * @param[in] agent_handle agent + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT Invalid agent + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION va is not mapped or permissions never set for this + * agent + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, + hsa_agent_t agent_handle); + +/** + * @brief Get an exportable shareable handle + * + * Get an exportable shareable handle for a memory_handle. This shareable handle can then be used to + * re-create a virtual memory handle using hsa_amd_vmem_import_shareable_handle.
The shareable + * handle can be transferred using mechanisms that support POSIX file descriptors. Once all shareable + * handles are closed, the memory_handle is released. + * + * @param[out] dmabuf_fd shareable handle + * @param[in] handle previously allocated virtual memory handle + * @param[in] flags Currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, + hsa_amd_vmem_alloc_handle_t handle, + uint64_t flags); +/** + * @brief Import a shareable handle + * + * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed + * and released results in undefined behavior. + * + * @param[in] dmabuf_fd shareable handle exported with hsa_amd_vmem_export_shareable_handle + * @param[out] handle virtual memory handle + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory handle + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Out of resources + * + * @retval ::HSA_STATUS_ERROR Unexpected internal error + */ +hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, + hsa_amd_vmem_alloc_handle_t* handle); + +/** + * @brief Returns memory handle for mapped memory + * + * Return a memory handle for previously mapped memory. The handle will be the same value of handle + * used to map the memory. The returned handle must be released with corresponding number of calls + * to hsa_amd_vmem_handle_release.
+ * + * @param[out] memory_handle memory handle for this mapped address + * @param[in] addr mapped address + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid address + */ +hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle, + void* addr); + +/** + * @brief Returns the current allocation properties of a handle + * + * Returns the allocation properties of an existing handle + * + * @param[in] memory_handle memory handle to be queried + * @param[out] pool memory pool that owns this handle + * @param[out] type memory type + * + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle + */ +hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( + hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool, + hsa_amd_memory_type_t* type); + +/** @} */ + +/** \addtogroup queue Queues + * @{ + */ + +/** + * @brief Set the asynchronous scratch limit threshold on all the queues for this agent. + * Dispatches that are enqueued on HW queues on this agent that are smaller than threshold will not + * result in a scratch use-once method. + * + * Increasing this threshold will only increase the internal limit and not cause immediate allocation + * of additional scratch memory. Decreasing this threshold will result in a release in scratch memory + * on queues where the current amount of allocated scratch exceeds the new limit. + * + * If this API call would result in a release in scratch memory and there are dispatches that are + * currently using scratch memory on this agent, this will result in a blocking call until the + * current dispatches are completed. + * + * This API is only supported on devices that support asynchronous scratch reclaim. + * + * @param[in] agent A valid agent. + * + * @param[in] threshold Threshold size in bytes + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT This agent does not support asynchronous scratch + * reclaim + */ +hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); + +typedef enum { + /* + * Returns the agent that owns the underlying HW queue. + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_QUEUE_INFO_AGENT, + /* + * Returns the doorbell ID of the completion signal of the queue + * The type of this attribute is uint64_t. + */ + HSA_AMD_QUEUE_INFO_DOORBELL_ID, +} hsa_queue_info_attribute_t; + +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + +typedef struct hsa_amd_ais_file_handle_s { + /* + * file handle for AIS read & write. Linux will use fd. + * pad keeps the size consistent across different platforms. + */ + union { + void* handle; + int fd; + uint8_t pad[8]; + }; +} hsa_amd_ais_file_handle_t; + +/** + * @brief Write data from device memory to a file + * + * Writes data from device memory buffer to a file at the specified offset. + * The device memory pointer must be accessible from the host and point to + * a valid allocation. + * + * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and + * APIs may be modified + * + * @param[in] handle Handle of the file to write to. + * + * @param[in] devicePtr Device memory buffer pointer containing data to write. + * + * @param[in] size Size in bytes of the data to write. + * + * @param[in] file_offset Offset in bytes into the file where data will be written. + * + * @param[in,out] size_copied Actual number of bytes copied + * + * @param[in,out] status Additional status if any + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr + * is NULL, or @p size is 0. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to + * a valid allocation. + * + * @retval ::HSA_STATUS_ERROR An error occurred during the write operation. + */ +hsa_status_t HSA_API hsa_amd_ais_file_write(hsa_amd_ais_file_handle_t handle, void *devicePtr, + uint64_t size, int64_t file_offset, + uint64_t *size_copied, int32_t *status); + +/** + * @brief Read data from a file to device memory + * + * Reads data from a file at the specified offset into a device memory buffer. + * The device memory pointer must be accessible from the host and point to + * a valid allocation. + * + * EXPERIMENTAL: AIS read and write calls are currently in experimental phase and + * APIs may be modified + * @param[in] handle Handle of the file to read from. + * + * @param[in] devicePtr Device memory buffer pointer to store the read data. + * + * @param[in] size Size in bytes of the data to read. + * + * @param[in] file_offset Offset in bytes into the file where data will be read from. + * + * @param[in,out] size_copied Actual number of bytes copied + * + * @param[in,out] status Additional status if any + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is invalid, @p devicePtr + * is NULL, or @p size is 0. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p devicePtr does not refer to + * a valid allocation. + * + * @retval ::HSA_STATUS_ERROR An error occurred during the read operation.
+ */ +hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, void *devicePtr, + uint64_t size, int64_t file_offset, + uint64_t *size_copied, int32_t *status); + +/** + * @brief logging types + */ +typedef enum hsa_amd_log_flag_s { + /* Log AQL packets internally enqueued by ROCr */ + HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS = 0, + HSA_AMD_LOG_FLAG_AQL = 0, + /* Log SDMA packets */ + HSA_AMD_LOG_FLAG_SDMA = 1, + /* Log INFO */ + HSA_AMD_LOG_FLAG_INFO = 2, +} hsa_amd_log_flag_t; + +/** + * @brief Enable logging via external file + * If this function is called multiple times, the last call to this function will overwrite the + * previous @p flags and @p file. + * + * @param[in] flags is used to filter types of logging. Type is uint8_t[8]. + * Can be set using the hsa_flag_set64 macro. Setting @p flags to 0 will disable logging. + * @param[in] file file stream to output logging. If file is NULL, prints are sent to stderr. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t hsa_amd_enable_logging(uint8_t* flags, void* file); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h new file mode 100644 index 0000000000..94c4582055 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_finalize.h @@ -0,0 +1,531 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ +#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_FINALIZER +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct BrigModuleHeader; +typedef struct BrigModuleHeader* BrigModule_t; + +/** \defgroup ext-alt-finalizer-extensions Finalization Extensions + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + */ +enum { + /** + * The HSAIL program is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000, + /** + * The HSAIL module is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001, + /** + * Machine model or profile of the HSAIL module do not match the machine model + * or profile of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002, + /** + * The HSAIL module is already a part of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003, + /** + * Compatibility mismatch between symbol declaration and symbol definition. + */ + HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004, + /** + * The finalization encountered an error while finalizing a kernel or + * indirect function. + */ + HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005, + /** + * Mismatch between a directive in the control directive structure and in + * the HSAIL kernel. + */ + HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006 +}; + +/** @} */ + +/** \defgroup ext-alt-finalizer-program Finalization Program + * @{ + */ + +/** + * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains + * the definition of the BrigModule_t type. 
+ */ +typedef BrigModule_t hsa_ext_module_t; + +/** + * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL + * modules that collectively define functions and variables used by kernels and + * indirect functions. + */ +typedef struct hsa_ext_program_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_ext_program_t; + +/** + * @brief Create an empty HSAIL program. + * + * @param[in] machine_model Machine model used in the HSAIL program. + * + * @param[in] profile Profile used in the HSAIL program. + * + * @param[in] default_float_rounding_mode Default float rounding mode used in + * the HSAIL program. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] program Memory location where the HSA runtime stores the newly + * created HSAIL program handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid, + * @p profile is invalid, @p default_float_rounding_mode is invalid, or + * @p program is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_create( + hsa_machine_model_t machine_model, + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_ext_program_t *program); + +/** + * @brief Destroy a HSAIL program. + * + * @details The HSAIL program handle becomes invalid after it has been + * destroyed. Code object handles produced by ::hsa_ext_program_finalize are + * still valid after the HSAIL program has been destroyed, and can be used as + * intended. 
Resources allocated outside and associated with the HSAIL program + * (such as HSAIL modules that are added to the HSAIL program) can be released + * after the finalization program has been destroyed. + * + * @param[in] program HSAIL program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + */ +hsa_status_t HSA_API hsa_ext_program_destroy( + hsa_ext_program_t program); + +/** + * @brief Add a HSAIL module to an existing HSAIL program. + * + * @details The HSA runtime does not perform a deep copy of the HSAIL module + * upon addition. Instead, it stores a pointer to the HSAIL module. The + * ownership of the HSAIL module belongs to the application, which must ensure + * that @p module is not released before destroying the HSAIL program. + * + * The HSAIL module is successfully added to the HSAIL program if @p module is + * valid, if all the declarations and definitions for the same symbol are + * compatible, and if @p module specify machine model and profile that matches + * the HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] module HSAIL module. The application can add the same HSAIL module + * to @p program at most once. The HSAIL module must specify the same machine + * model and profile as @p program. If the floating-mode rounding mode of @p + * module is not default, then it should match that of @p program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. 
+ * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p + * module does not match machine model of @p program, or the profile of @p + * module does not match profile of @p program. + * + * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is + * already a part of the HSAIL program. + * + * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol + * definition compatibility mismatch. See the symbol compatibility rules in the + * HSA Programming Reference Manual. + */ +hsa_status_t HSA_API hsa_ext_program_add_module( + hsa_ext_program_t program, + hsa_ext_module_t module); + +/** + * @brief Iterate over the HSAIL modules in a program, and invoke an + * application-defined callback on every iteration. + * + * @param[in] program HSAIL program. + * + * @param[in] callback Callback to be invoked once per HSAIL module in the + * program. The HSA runtime passes three arguments to the callback: the program, + * a HSAIL module, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_ext_program_iterate_modules returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_iterate_modules( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, + void* data), + void* data); + +/** + * @brief HSAIL program attributes. 
+ */ +typedef enum { + /** + * Machine model specified when the HSAIL program was created. The type + * of this attribute is ::hsa_machine_model_t. + */ + HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0, + /** + * Profile specified when the HSAIL program was created. The type of + * this attribute is ::hsa_profile_t. + */ + HSA_EXT_PROGRAM_INFO_PROFILE = 1, + /** + * Default float rounding mode specified when the HSAIL program was + * created. The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2 +} hsa_ext_program_info_t; + +/** + * @brief Get the current value of an attribute for a given HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behaviour is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * HSAIL program attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_get_info( + hsa_ext_program_t program, + hsa_ext_program_info_t attribute, + void *value); + +/** + * @brief Finalizer-determined call convention. + */ +typedef enum { + /** + * Finalizer-determined call convention. + */ + HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1 +} hsa_ext_finalizer_call_convention_t; + +/** + * @brief Control directives specify low-level information about the + * finalization process. + */ +typedef struct hsa_ext_control_directives_s { + /** + * Bitset indicating which control directives are enabled. 
The bit assigned to + * a control directive is determined by the corresponding value in + * BrigControlDirective. + * + * If a control directive is disabled, its corresponding field value (if any) + * must be 0. Control directives that are only present or absent (such as + * partial workgroups) have no corresponding field as the presence of the bit + * in this mask is sufficient. + */ + uint64_t control_directives_mask; + /** + * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit + * assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t break_exceptions_mask; + /** + * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The + * bit assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t detect_exceptions_mask; + /** + * Maximum size (in bytes) of dynamic group memory that will be allocated by + * the application for any dispatch of the kernel. If the kernel contains a + * maxdynamicsize control directive, the two values should match. + */ + uint32_t max_dynamic_group_size; + /** + * Maximum number of grid work-items that will be used by the application to + * launch the kernel. If the kernel contains a maxflatgridsize control + * directive, the value of @a max_flat_grid_size must not be greater than the + * value of the directive, and takes precedence. + * + * The value specified for maximum absolute grid size must be greater than or + * equal to the product of the values specified by @a required_grid_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a + * control_directives_mask, this field must be greater than 0.
+ */ + uint64_t max_flat_grid_size; + /** + * Maximum number of work-group work-items that will be used by the + * application to launch the kernel. If the kernel contains a + * maxflatworkgroupsize control directive, the value of @a + * max_flat_workgroup_size must not be greater than the value of the + * directive, and takes precedence. + * + * The value specified for maximum absolute work-group size must be greater than or + * equal to the product of the values specified by @a required_workgroup_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint32_t max_flat_workgroup_size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Grid size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredgridsize control directive, the + * dimensions should match. + * + * The specified grid size must be consistent with @a required_workgroup_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_grid_size. Note that the listed invariants must hold + * only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + uint64_t required_grid_size[3]; + /** + * Work-group size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredworkgroupsize control directive, + * the dimensions should match. + * + * The specified work-group size must be consistent with @a required_grid_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_workgroup_size. Note that the listed invariants must + * hold only if all the corresponding control directives are enabled.
+ * + * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + hsa_dim3_t required_workgroup_size; + /** + * Number of dimensions that will be used by the application to launch the + * kernel. If the kernel contains a requireddim control directive, the two + * values should match. + * + * The specified dimensions must be consistent with @a required_grid_size and + * @a required_workgroup_size. This invariant must hold only if all the + * corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a + * control_directives_mask, this field must be 1, 2, or 3. + */ + uint8_t required_dim; + /** + * Reserved. Must be 0. + */ + uint8_t reserved2[75]; +} hsa_ext_control_directives_t; + +/** + * @brief Finalize an HSAIL program for a given instruction set architecture. + * + * @details Finalize all of the kernels and indirect functions that belong to + * the same HSAIL program for a specific instruction set architecture (ISA). The + * transitive closure of all functions specified by call or scall must be + * defined. Kernels and indirect functions that are being finalized must be + * defined. Kernels and indirect functions that are referenced in kernels and + * indirect functions being finalized may or may not be defined, but must be + * declared. All the global/readonly segment variables that are referenced in + * kernels and indirect functions being finalized may or may not be defined, but + * must be declared. + * + * @param[in] program HSAIL program. + * + * @param[in] isa Instruction set architecture to finalize for. + * + * @param[in] call_convention A call convention used in a finalization. Must + * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive) + * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p + * isa (not inclusive). 
+ * + * @param[in] control_directives Low-level control directives that influence + * the finalization process. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[in] code_object_type Type of code object to produce. + * + * @param[out] code_object Code object generated by the Finalizer, which + * contains the machine code for the kernels and indirect functions in the HSAIL + * program. The code object is independent of the HSAIL module that was used to + * generate it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in + * the control directive structure and in the HSAIL kernel mismatch, or if the + * same directive is used with a different value in one of the functions used by + * this kernel. + * + * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer + * encountered an error while compiling a kernel or an indirect function. 
+ */ +hsa_status_t HSA_API hsa_ext_program_finalize( + hsa_ext_program_t program, + hsa_isa_t isa, + int32_t call_convention, + hsa_ext_control_directives_t control_directives, + const char *options, + hsa_code_object_type_t code_object_type, + hsa_code_object_t *code_object); + +/** @} */ + +#define hsa_ext_finalizer_1_00 + +typedef struct hsa_ext_finalizer_1_00_pfn_s { + hsa_status_t (*hsa_ext_program_create)( + hsa_machine_model_t machine_model, hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, hsa_ext_program_t *program); + + hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program); + + hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program, + hsa_ext_module_t module); + + hsa_status_t (*hsa_ext_program_iterate_modules)( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, + hsa_ext_module_t module, void *data), + void *data); + + hsa_status_t (*hsa_ext_program_get_info)( + hsa_ext_program_t program, hsa_ext_program_info_t attribute, + void *value); + + hsa_status_t (*hsa_ext_program_finalize)( + hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, + hsa_ext_control_directives_t control_directives, const char *options, + hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object); +} hsa_ext_finalizer_1_00_pfn_t; + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h new file mode 100644 index 0000000000..cad9b50820 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ext_image.h @@ -0,0 +1,1515 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro 
Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_EXT_IMAGE_H +#define HSA_EXT_IMAGE_H + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_IMAGES +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +/** \defgroup ext-images Images and Samplers + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * Image format is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000, + /** + * Image size is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001, + /** + * Image pitch is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002, + /** + * Sampler descriptor is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003 +}; + +/** + * @brief Enumeration constants added to ::hsa_agent_info_t by this + * extension. + * + * @remark Additions to hsa_agent_info_t + */ +enum { + /** + * Maximum number of elements in 1D images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000, + /** + * Maximum number of elements in 1DA images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001, + /** + * Maximum number of elements in 1DB images. Must be at least 65536. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002, + /** + * Maximum dimensions (width, height) of 2D images, in image elements. The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003, + /** + * Maximum dimensions (width, height) of 2DA images, in image elements. 
The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004, + /** + * Maximum dimensions (width, height) of 2DDEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005, + /** + * Maximum dimensions (width, height) of 2DADEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006, + /** + * Maximum dimensions (width, height, depth) of 3D images, in image + * elements. The maximum along any dimension must be at least 2048. The type + * of this attribute is size_t[3]. + */ + HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007, + /** + * Maximum number of image layers in an image array. Must be at least 2048. The + * type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008, + /** + * Maximum number of read-only image handles that can be created for an agent at any one + * time. Must be at least 128. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009, + /** + * Maximum number of write-only and read-write image handles (combined) that + * can be created for an agent at any one time. Must be at least 64. The type of this + * attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A, + /** + * Maximum number of sampler handlers that can be created for an agent at any one + * time. Must be at least 16. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B, + /** + * Image pitch alignment. The agent only supports linear image data + * layouts with a row pitch that is a multiple of this value. Must be + * a power of 2. The type of this attribute is size_t.
+ */ + HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C +}; + +/** + * @brief Image handle, populated by ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. Image + * handles are only unique within an agent, not across agents. + * + */ +typedef struct hsa_ext_image_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; + +} hsa_ext_image_t; + +/** + * @brief Geometry associated with the image. This specifies the + * number of image dimensions and whether the image is an image + * array. See the Image Geometry section in the HSA + * Programming Reference Manual for definitions on each + * geometry. The enumeration values match the BRIG type @p + * hsa_ext_brig_image_geometry_t. + */ +typedef enum { +/** + * One-dimensional image addressed by width coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1D = 0, + + /** + * Two-dimensional image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2D = 1, + + /** + * Three-dimensional image addressed by width, height, and depth coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_3D = 2, + + /** + * Array of one-dimensional images with the same size and format. 1D arrays + * are addressed by width and index coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1DA = 3, + + /** + * Array of two-dimensional images with the same size and format. 2D arrays + * are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DA = 4, + + /** + * One-dimensional image addressed by width coordinate. It has + * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An + * image with an opaque image data layout will always use a linear + * image data layout, and one with an explicit image data layout + * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR. + */ + HSA_EXT_IMAGE_GEOMETRY_1DB = 5, + + /** + * Two-dimensional depth image addressed by width and height coordinates. 
+ */ + HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6, + + /** + * Array of two-dimensional depth images with the same size and format. 2D + * arrays are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7 +} hsa_ext_image_geometry_t; + +/** + * @brief Channel type associated with the elements of an image. See + * the Channel Type section in the HSA Programming Reference + * Manual for definitions on each channel type. The + * enumeration values and definition match the BRIG type @p + * hsa_ext_brig_image_channel_type_t. + */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} hsa_ext_image_channel_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants. + */ +typedef uint32_t hsa_ext_image_channel_type32_t; + +/** + * + * @brief Channel order associated with the elements of an image. See + * the Channel Order section in the HSA Programming Reference + * Manual for definitions on each channel order. The + * enumeration values match the BRIG type @p + * hsa_ext_brig_image_channel_order_t. 
+ */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0, + HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} hsa_ext_image_channel_order_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants. + */ +typedef uint32_t hsa_ext_image_channel_order32_t; + + +/** + * @brief Image format. + */ +typedef struct hsa_ext_image_format_s { + /** + * Channel type. + */ + hsa_ext_image_channel_type32_t channel_type; + + /** + * Channel order. + */ + hsa_ext_image_channel_order32_t channel_order; +} hsa_ext_image_format_t; + +/** + * @brief Implementation independent image descriptor. + */ +typedef struct hsa_ext_image_descriptor_s { + /** + * Image geometry. + */ + hsa_ext_image_geometry_t geometry; + /** + * Width of the image, in components. + */ + size_t width; + /** + * Height of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D, + * ::HSA_EXT_IMAGE_GEOMETRY_2DA, ::HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t height; + /** + * Depth of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0.
+ */ + size_t depth; + /** + * Number of image layers in the image array. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t array_size; + /** + * Image format. + */ + hsa_ext_image_format_t format; +} hsa_ext_image_descriptor_t; + +/** + * @brief Image capability. + */ +typedef enum { + /** + * Images of this geometry, format, and layout are not supported by + * the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0, + /** + * Read-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1, + /** + * Write-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2, + /** + * Read-write images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4, + /** + * @deprecated Images of this geometry, format, and layout can be accessed from + * read-modify-write atomic operations in the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8, + /** + * Images of this geometry, format, and layout are guaranteed to + * have a consistent data layout regardless of how they are + * accessed by the associated agent. + */ + HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10 +} hsa_ext_image_capability_t; + +/** + * @brief Image data layout. + * + * @details An image data layout denotes such aspects of image data + * layout as tiling and organization of channels in memory. Some image + * data layouts may only apply to specific image geometries, formats, + * and access permissions. Different agents may support different + * image layout identifiers, including vendor specific layouts.
Note + * that an agent may not support the same image data layout for + * different access permissions to images with the same image + * geometry, size, and format. If multiple agents support the same + * image data layout then it is possible to use separate image handles + * for each agent that references the same image data. + */ + +typedef enum { + /** + * An implementation specific opaque image data layout which can + * vary depending on the agent, geometry, image format, image size, + * and access permissions. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0, + /** + * The image data layout is specified by the following rules in + * ascending byte address order. For a 3D image, 2DA image array, + * or 1DA image array, the image data is stored as a linear sequence + * of adjacent 2D image slices, 2D images, or 1D images + * respectively, spaced according to the slice pitch. Each 2D image + * is stored as a linear sequence of adjacent image rows, spaced + * according to the row pitch. Each 1D or 1DB image is stored as a + * single image row. Each image row is stored as a linear sequence + * of image elements. Each image element is stored as a linear + * sequence of image components specified by the left to right + * channel order definition. Each image component is stored using + * the memory type specified by the channel type. + * + * The 1DB image geometry always uses the linear image data layout. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1 +} hsa_ext_image_data_layout_t; + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, and image format for an image created with an opaque image + * data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. 
+ * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, or @p capability_mask is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_get_capability( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, image format, and image layout for an image created with + * an explicit image data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. + * + * @param[in] image_data_layout The image data layout. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_get_capability instead. + * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p capability_mask is NULL. 
+ */ +hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + +/** + * @brief Agent specific image size and alignment requirements, populated by + * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout. + */ +typedef struct hsa_ext_image_data_info_s { + /** + * Image data size, in bytes. + */ + size_t size; + + /** + * Image data alignment, in bytes. Must always be a power of 2. + */ + size_t alignment; + +} hsa_ext_image_data_info_t; + +/** + * @brief Retrieve the image data requirements for a given combination of agent, image + * descriptor, and access permission for an image created with an opaque image + * data layout. + * + * @details The optimal image data size and alignment requirements may + * vary depending on the image attributes specified in @p + * image_descriptor, the @p access_permission, and the @p agent. Also, + * different implementations of the HSA runtime may return different + * requirements for the same input values. + * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors as long + * as ::hsa_ext_image_get_capability reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by @p agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. 
The @p agent must support the image format + * specified in @p image_descriptor for the given @p + * access_permission. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p + * agent does not support the image format specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * access_permission is not a valid access permission value, or @p + * image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Retrieve the image data requirements for a given combination of + * image descriptor, access permission, image data layout, image data row pitch, + * and image data slice pitch for an image created with an explicit image + * data layout. + * + * @details The image data size and alignment requirements may vary + * depending on the image attributes specified in @p image_descriptor, + * the @p access_permission, and the image layout. However, different + * implementations of the HSA runtime will return the same + * requirements for the same input values. 
+ * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors and + * matching image layouts as long as ::hsa_ext_image_get_capability + * reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. Image layouts match if they are the + * same image data layout and use the same image row and slice pitch + * values. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by an agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. + * + * @param[in] image_data_layout The image data layout to use. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_data_get_info instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. + * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. 
If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image + * format specified by @p image_descriptor is not supported for the + * @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image + * dimensions specified by @p image_descriptor are not supported for + * the @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and + * slice pitch specified by @p image_data_row_pitch and @p + * image_data_slice_pitch are invalid or not supported. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Creates an agent specific image handle to an image with an + * opaque image data layout. 
+ * + * @details Images with an opaque image data layout created with + * different access permissions but matching image descriptors and + * same agent can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability for the image format specified in + * the image descriptor. Image descriptors match if they have the same + * values, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. + * + * If necessary, an application can use image operations (import, + * export, copy, clear) to prepare the image for the intended use + * regardless of the access permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by agent. The access permission defines how the agent + * is allowed to access the image using the image handle created and + * must match the corresponding HSAIL image handle type. The agent + * must support the image format specified in @p image_descriptor for + * the given @p access_permission. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent + * does not have the capability to support the image format contained + * in @p image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources (for example, the agent does not + * support the creation of more image handles with the given @p + * access_permission). + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p access_permission is not a valid access permission + * value, or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + +/** + * @brief Creates an agent specific image handle to an image with an explicit + * image data layout. + * + * @details Images with an explicit image data layout created with + * different access permissions but matching image descriptors and + * matching image layout can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability_with_layout for the image format + * specified in the image descriptor and specified image data + * layout. Image descriptors match if they have the same values, with + * the exception that s-form channel orders match the corresponding + * non-s-form channel order and vice versa. Image layouts match if + * they are the same image data layout and use the same image row and + * slice pitch values.
+ * + * If necessary, an application can use image operations (import, export, copy, + * clear) to prepare the image for the intended use regardless of the access + * permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by the agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. The agent must support the image format + * specified in @p image_descriptor for the given @p access_permission + * and @p image_data_layout. + * + * @param[in] image_data_layout The image data layout to use for the + * @p image_data. It is invalid to use + * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create + * instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. 
+ * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does + * not have the capability to support the image format contained in the image + * descriptor using the specified @p access_permission and @p image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission and @p + * image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does + * not support the row and slice pitch specified by @p image_data_row_pitch + * and @p image_data_slice_pitch, or the values are invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources (for example, the agent does not + * support the creation of more image handles with the given @p + * access_permission).
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + +/** + * @brief Destroy an image handle previously created using ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. + * + * @details Destroying the image handle does not free the associated image data, + * or modify its contents. The application should not destroy an image handle while + * there are references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_image_destroy( + hsa_agent_t agent, + hsa_ext_image_t image); + +/** + * @brief Copies a portion of one image (the source) to another image (the + * destination). + * + * @details The source and destination image formats should be the + * same, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. For example, + * it is allowed to copy a source image with a channel order of + * HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a + * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB. 
+ * + * The source and destination images do not have to be of the same geometry and + * appropriate scaling is performed by the HSA runtime. It is possible to copy + * subregions between any combinations of source and destination geometries, provided + * that the dimensions of the subregions are the same. For example, it is + * allowed to copy a rectangular region from a 2D image to a slice of a 3D + * image. + * + * If the source and destination image data overlap, or the combination of + * offset and range references an out-of-bounds element in any of the images, + * the behavior is undefined. + * + * @param[in] agent Agent associated with both the source and destination image handles. + * + * @param[in] src_image Image handle of source image. The agent associated with the source + * image handle must be identical to that of the destination image. + * + * @param[in] src_offset Pointer to the offset within the source image where to + * copy the data from. Must not be NULL. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] dst_offset Pointer to the offset within the destination + * image where to copy the data. Must not be NULL. + * + * @param[in] range Dimensions of the image portion to be copied. The HSA + * runtime computes the size of the image data to be copied using this + * argument. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is + * NULL, @p dst_offset is NULL, or @p range is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_copy( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + +/** + * @brief Image region.
+ */ +typedef struct hsa_ext_image_region_s { + /** + * Offset within an image (in coordinates). + */ + hsa_dim3_t offset; + + /** + * Dimension size of the image range (in coordinates). The x, y, and z dimensions + * correspond to width, height, and depth or index respectively. + */ + hsa_dim3_t range; +} hsa_ext_image_region_t; + +/** + * @brief Import linearly organized image data from memory directly to an + * image handle. + * + * @details This operation updates the image data referenced by the image handle + * from the source memory. The size of the data imported from memory is + * implicitly derived from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the source memory or destination image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the import operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_memory Source memory. Must not be NULL. + * + * @param[in] src_row_pitch The size in bytes of a single row of the image in the + * source memory. If the value is smaller than the destination image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image layer in an image array in the source memory. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p src_row_pitch * destination image region height, then the value used for + * @p src_row_pitch * destination image region height is used.
+ * Otherwise, the value is not used. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] image_region Pointer to the image region to be updated. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p + * image_region is NULL. + * + */ +hsa_status_t HSA_API hsa_ext_image_import( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Export the image data to linearly organized memory. + * + * @details The operation updates the destination memory with the image data of + * @p src_image. The size of the data exported to memory is implicitly derived + * from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the destination memory or source image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the export operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_image Image handle of source image. + * + * @param[in] dst_memory Destination memory. Must not be NULL. + * + * @param[in] dst_row_pitch The size in bytes of a single row of the image in the + * destination memory. If the value is smaller than the source image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image in an image array in the destination memory. 
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p dst_row_pitch * source image region height, then the value used for + * @p dst_row_pitch * source image region height is used. + * Otherwise, the value is not used. + * + * @param[in] image_region Pointer to the image region to be exported. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_export( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Clear a region of an image so that every image element has + * the specified value. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle for image to be cleared. + * + * @param[in] data The value to which to set each image element being + * cleared. It is specified as an array of image component values. The + * number of array elements must match the number of access components + * for the image channel order. The type of each array element must + * match the image access type of the image channel type. When the + * value is used to set the value of an image element, the conversion + * method corresponding to the image channel type is used. 
See the + * Channel Order section and Channel Type section in + * the HSA Programming Reference Manual for more + * information. Must not be NULL. + * + * @param[in] image_region Pointer to the image region to clear. Must not be + * NULL. If the region references an out-of-bounds element, the behavior is + * undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_clear( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Sampler handle. Samplers are populated by + * ::hsa_ext_sampler_create or ::hsa_ext_sampler_create_v2. Sampler handles are only unique + * within an agent, not across agents. + */ +typedef struct hsa_ext_sampler_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; +} hsa_ext_sampler_t; + +/** + * @brief Sampler address modes. The sampler address mode describes + * the processing of out-of-range image coordinates. See the + * Addressing Mode section in the HSA Programming Reference + * Manual for definitions on each address mode. The values + * match the BRIG type @p hsa_ext_brig_sampler_addressing_t. + */ +typedef enum { + /** + * Out-of-range coordinates are not handled. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0, + + /** + * Clamp out-of-range coordinates to the image edge. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1, + + /** + * Clamp out-of-range coordinates to the image border color.
+ */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2, + + /** + * Wrap out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3, + + /** + * Mirror out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles with every other + * tile a reflection. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4 + +} hsa_ext_sampler_addressing_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_addressing_mode32_t; + +/** + * @brief Sampler coordinate normalization modes. See the + * Coordinate Normalization Mode section in the HSA + * Programming Reference Manual for definitions on each + * coordinate normalization mode. The values match the BRIG type @p + * hsa_ext_brig_sampler_coord_normalization_t. + */ +typedef enum { + + /** + * Coordinates are used to directly address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0, + + /** + * Coordinates are scaled by the image dimension size before being + * used to address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1 + +} hsa_ext_sampler_coordinate_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_coordinate_mode32_t; + + +/** + * @brief Sampler filter modes. See the Filter Mode section + * in the HSA Programming Reference Manual for definitions + * on each filter mode. The enumeration values match the BRIG type @p + * hsa_ext_brig_sampler_filter_t. + */ +typedef enum { + /** + * Filter to the image element nearest (in Manhattan distance) to the + * specified coordinate.
+ */ + HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + + /** + * Filter to the image element calculated by combining the elements in a 2x2 + * square block or 2x2x2 cube block around the specified coordinate. The + * elements are combined using linear interpolation. + */ + HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1 + +} hsa_ext_sampler_filter_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_filter_mode32_t; + +/** + * @brief Implementation independent sampler descriptor. + */ +typedef struct hsa_ext_sampler_descriptor_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_mode; +} hsa_ext_sampler_descriptor_t; + +/** + * @brief Implementation independent sampler descriptor v2 which supports + * different address modes on the X, Y and Z axes. + */ +typedef struct hsa_ext_sampler_descriptor_v2_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_modes[3]; // on the X, Y and Z axes +} hsa_ext_sampler_descriptor_v2_t; + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created.
+ * + * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor v2 and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created. + * + * @param[in] sampler_descriptor Pointer to a v2 sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The
 + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create_v2( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create or + * ::hsa_ext_sampler_create_v2. + * + * @details The sampler handle should not be destroyed while there are + * references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the sampler handle. + * + * @param[in] sampler Sampler handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_sampler_destroy( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + +#define hsa_ext_images_1_00 + +/** + * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. 
+ */ +typedef struct hsa_ext_images_1_00_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + +} hsa_ext_images_1_00_pfn_t; + +#define hsa_ext_images_1 + +/** + * @brief The function pointer table for the images v1 extension. 
Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ext_images_1_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + hsa_status_t (*hsa_ext_image_get_capability_with_layout)( + hsa_agent_t agent, + hsa_ext_image_geometry_t 
geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_sampler_create_v2)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_v2_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +} hsa_ext_images_1_pfn_t; +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h new file mode 100644 index 0000000000..a49221c49e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_aqlprofile.h @@ -0,0 +1,488 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ +#define OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ + +#include +#include "hsa.h" + +#define HSA_AQLPROFILE_VERSION_MAJOR 2 +#define HSA_AQLPROFILE_VERSION_MINOR 0 + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//////////////////////////////////////////////////////////////////////////////// +// Library version +uint32_t hsa_ven_amd_aqlprofile_version_major(); +uint32_t hsa_ven_amd_aqlprofile_version_minor(); + +/////////////////////////////////////////////////////////////////////// +// Library API: +// The library provides helper methods for instantiation of +// the profile context object and for populating of the start +// and stop AQL packets. The profile object contains a profiling +// events list and needed for profiling buffers descriptors, +// a command buffer and an output data buffer. To check if there +// was an error the library methods return a status code. Also +// the library provides methods for querying required buffers +// attributes, to validate the event attributes and to get profiling +// output data. +// +// Returned status: +// hsa_status_t – HSA status codes are used from hsa.h header +// +// Supported profiling features: +// +// Supported profiling events +typedef enum { + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0, + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1, +} hsa_ven_amd_aqlprofile_event_type_t; + +// Supported performance counters (PMC) blocks +// The block ID is the same for a block instances set, for example +// each block instance from the TCC block set, TCC0, TCC1, …, TCCN +// will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCKS_TCC. 
+typedef enum { + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14, + // Memory related blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24, + // System blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25, + // GFX10 added blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1A = 26, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL1C = 27, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2A = 28, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GL2C = 29, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31, + + // UMC & MMEA System Blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33, + + HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER +} hsa_ven_amd_aqlprofile_block_name_t; + +// PMC event object structure +// ‘counter_id’ value is specified in GFXIPs perfcounter user guides +// which is the counters select value, “Performance Counters Selection” +// chapter. 
+typedef struct { + hsa_ven_amd_aqlprofile_block_name_t block_name; + uint32_t block_index; + uint32_t counter_id; +} hsa_ven_amd_aqlprofile_event_t; + +// Check if event is valid for the specific GPU +hsa_status_t hsa_ven_amd_aqlprofile_validate_event( + hsa_agent_t agent, // HSA handle for the profiling GPU + const hsa_ven_amd_aqlprofile_event_t* event, // [in] Pointer on validated event + bool* result); // [out] True if the event valid, False otherwise + +// Profiling parameters +// All parameters are generic and if not applicable for a specific +// profile configuration then error status will be returned. +typedef enum { + /** + * Select the target compute unit (wgp) for profiling. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0, + /** + * VMID Mask + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4, + /** + * Shader engine mask for selection. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6, + /** + * Legacy. Deprecated. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7, + /** + * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8, + /** + * Set true for occupancy collection only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9, + /** + * ATT collection max data size, in MB. Shared among shader engines. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10, + /** + * Mask of which compute units to generate perfcounters. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240, + /** + * Select collection period for perfcounters. GFX9 only. 
+ */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241, + /** + * Select perfcounter ID (SQ block) for collection. GFX9 only. + */ + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242, +} hsa_ven_amd_aqlprofile_parameter_name_t; + +// Profile parameter object +typedef struct { + hsa_ven_amd_aqlprofile_parameter_name_t parameter_name; + uint32_t value; +} hsa_ven_amd_aqlprofile_parameter_t; + +typedef enum { + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_0 = 0, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_1, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_2, + HSA_VEN_AMD_AQLPROFILE_ATT_CHANNEL_3 +} hsa_ven_amd_aqlprofile_att_marker_channel_t; + +// +// Profile context object: +// The library provides a profile object structure which contains +// the events array, a buffer for the profiling start/stop commands +// and a buffer for the output data. +// The buffers are specified by the buffer descriptors and allocated +// by the application. The buffers allocation attributes, the command +// buffer size, the PMC output buffer size as well as profiling output +// data can be get using the generic get profile info helper _get_info. 
+// +// Buffer descriptor +typedef struct { + void* ptr; + uint32_t size; +} hsa_ven_amd_aqlprofile_descriptor_t; + +// Profile context object structure, contains profiling events list and +// needed for profiling buffers descriptors, a command buffer and +// an output data buffer +typedef struct { + hsa_agent_t agent; // GFXIP handle + hsa_ven_amd_aqlprofile_event_type_t type; // Events type + const hsa_ven_amd_aqlprofile_event_t* events; // Events array + uint32_t event_count; // Events count + const hsa_ven_amd_aqlprofile_parameter_t* parameters; // Parameters array + uint32_t parameter_count; // Parameters count + hsa_ven_amd_aqlprofile_descriptor_t output_buffer; // Output buffer + hsa_ven_amd_aqlprofile_descriptor_t command_buffer; // PM4 commands +} hsa_ven_amd_aqlprofile_profile_t; + +// +// AQL packets populating methods: +// The helper methods to populate provided by the application START and +// STOP AQL packets which the application is required to submit before and +// after profiled GPU task packets respectively. 
+// +// AQL Vendor Specific packet which carries a PM4 command +typedef struct { + uint16_t header; + uint16_t pm4_command[27]; + hsa_signal_t completion_signal; +} hsa_ext_amd_aql_pm4_packet_t; + +// Method to populate the provided AQL packet with profiling start commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_start( + hsa_ven_amd_aqlprofile_profile_t* profile, // [in,out] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet + +// Method to populate the provided AQL packet with profiling stop commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_stop( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet + +// Method to populate the provided AQL packet with profiling read commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ven_amd_aqlprofile_read( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); // [out] profile read AQL packet + +// Legacy devices, PM4 profiling packet size +const unsigned HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192; +// Legacy devices, converting the profiling AQL packet to PM4 packet blob +hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, // [in] AQL packet + void* data); // [out] PM4 packet blob + +// Method to add a marker (correlation ID) into the ATT buffer.
+hsa_status_t hsa_ven_amd_aqlprofile_att_marker( + hsa_ven_amd_aqlprofile_profile_t* profile, // [in,out] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_marker_packet, // [out] profile marker AQL packet + uint32_t data, // [in] Data to be inserted + hsa_ven_amd_aqlprofile_att_marker_channel_t channel); // [in] Comm channel + +// +// Get profile info: +// Generic method for getting various profile info including profile buffers +// attributes like the command buffer size and the profiling PMC results. +// It’s implied that all counters are 64bit values. +// +// Profile generic output data: +typedef struct { + uint32_t sample_id; // PMC sample or trace buffer index + union { + struct { + hsa_ven_amd_aqlprofile_event_t event; // PMC event + uint64_t result; // PMC result + } pmc_data; + hsa_ven_amd_aqlprofile_descriptor_t trace_data; // Trace output data descriptor + }; +} hsa_ven_amd_aqlprofile_info_data_t; + +// ID query type +typedef struct { + const char* name; + uint32_t id; + uint32_t instance_count; +} hsa_ven_amd_aqlprofile_id_query_t; + +// Profile attributes +typedef enum { + HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value + HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value + HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value + // in info_data object + HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3, // get_info returns trace buffer ptr/size + // in info_data object + HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counter + HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances + // by name string using _id_query_t + HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for + // counters enable command buffer + HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for + // counters disable command buffer +} hsa_ven_amd_aqlprofile_info_type_t; 
+ + +// Definition of output data iterator callback +typedef hsa_status_t (*hsa_ven_amd_aqlprofile_data_callback_t)( + hsa_ven_amd_aqlprofile_info_type_t info_type, // [in] data type, PMC or trace data + hsa_ven_amd_aqlprofile_info_data_t* info_data, // [in] info_data object + void* callback_data); // [in,out] data passed to the callback + +// Method for getting the profile info +hsa_status_t hsa_ven_amd_aqlprofile_get_info( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ven_amd_aqlprofile_info_type_t attribute, // [in] requested profile attribute + void* value); // [in,out] returned value + +// Method for iterating the events output data +hsa_status_t hsa_ven_amd_aqlprofile_iterate_data( + const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate the output data + void* data); // [in,out] data passed to the callback + +// Return error string +hsa_status_t hsa_ven_amd_aqlprofile_error_string( + const char** str); // [out] pointer on the error string + +/** + * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. + */ +typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name); +/** + * @brief Iterate over all possible event coordinate IDs and their names. + */ +hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param position A counting sequence indicating callback number. + * @param id Coordinate ID as in _iterate_event_ids. + * @param extent Coordinate extent indicating maximum allowed instances. + * @param coordinate The coordinate, in the range [0,extent-1]. + * @param name Coordinate name as in _iterate_event_ids. + * @param userdata Userdata returned from _iterate_event_coord function. 
+ */ +typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( + int position, + int id, + int extent, + int coordinate, + const char* name, + void* userdata +); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param[in] agent HSA agent. + * @param[in] event The event ID and block ID to iterate for. + * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. + * @param[in] callback Callback function to return the coordinates. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + */ +hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord( + hsa_agent_t agent, + hsa_ven_amd_aqlprofile_event_t event, + uint32_t sample_id, + hsa_ven_amd_aqlprofile_coordinate_callback_t callback, + void* userdata +); + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_aqlprofile_VERSION_MAJOR 1 +#define hsa_ven_amd_aqlprofile_LIB(suff) "libhsa-amd-aqlprofile" suff ".so" + +#ifdef HSA_LARGE_MODEL +static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("64"); +#else +static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB(""); +#endif + +/** + * @brief Extension function table. 
+ */ +typedef struct hsa_ven_amd_aqlprofile_1_00_pfn_s { + uint32_t (*hsa_ven_amd_aqlprofile_version_major)(); + uint32_t (*hsa_ven_amd_aqlprofile_version_minor)(); + + hsa_status_t (*hsa_ven_amd_aqlprofile_error_string)( + const char** str); + + hsa_status_t (*hsa_ven_amd_aqlprofile_validate_event)( + hsa_agent_t agent, + const hsa_ven_amd_aqlprofile_event_t* event, + bool* result); + + hsa_status_t (*hsa_ven_amd_aqlprofile_start)( + hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_stop)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_read)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); + + hsa_status_t (*hsa_ven_amd_aqlprofile_legacy_get_pm4)( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, + void* data); + + hsa_status_t (*hsa_ven_amd_aqlprofile_get_info)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ven_amd_aqlprofile_info_type_t attribute, + void* value); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_data)( + const hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ven_amd_aqlprofile_data_callback_t callback, + void* data); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_ids)( + hsa_ven_amd_aqlprofile_eventname_callback_t + ); + + hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_event_coord)( + hsa_agent_t agent, + hsa_ven_amd_aqlprofile_event_t event, + uint32_t sample_id, + hsa_ven_amd_aqlprofile_coordinate_callback_t callback, + void* userdata + ); + + hsa_status_t (*hsa_ven_amd_aqlprofile_att_marker)( + hsa_ven_amd_aqlprofile_profile_t* profile, + hsa_ext_amd_aql_pm4_packet_t* aql_packet, + uint32_t data, + hsa_ven_amd_aqlprofile_att_marker_channel_t channel + ); +} hsa_ven_amd_aqlprofile_1_00_pfn_t; + +typedef hsa_ven_amd_aqlprofile_1_00_pfn_t hsa_ven_amd_aqlprofile_pfn_t; + +#ifdef __cplusplus +} 
+#endif // __cplusplus + +#endif // OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h new file mode 100644 index 0000000000..47236c86e9 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_loader.h @@ -0,0 +1,667 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension for additional loader functionality. + +#ifndef HSA_VEN_AMD_LOADER_H +#define HSA_VEN_AMD_LOADER_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @brief Queries equivalent host address for given @p device_address, and + * records it in @p host_address. + * + * + * @details Contents of memory pointed to by @p host_address would be identical + * to contents of memory pointed to by @p device_address. Only difference + * between the two is host accessibility: @p host_address is always accessible + * from host, @p device_address might not be accessible from host. + * + * If @p device_address already points to host accessible memory, then the value + * of @p device_address is simply copied into @p host_address. + * + * The lifetime of @p host_address is the same as the lifetime of @p + * device_address, and both lifetimes are limited by the lifetime of the + * executable that is managing these addresses. + * + * + * @param[in] device_address Device address to query equivalent host address + * for. + * + * @param[out] host_address Pointer to application-allocated buffer to record + * queried equivalent host address in. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. 
+ * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or + * null, or @p host_address is null. + */ +hsa_status_t hsa_ven_amd_loader_query_host_address( + const void *device_address, + const void **host_address); + +/** + * @brief The storage type of the code object that is backing loaded memory + * segment. + */ +typedef enum { + /** + * Loaded memory segment is not backed by any code object (anonymous), as the + * case would be with BSS (uninitialized data). + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0, + /** + * Loaded memory segment is backed by the code object that is stored in the + * file. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1, + /** + * Loaded memory segment is backed by the code object that is stored in the + * memory. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2 +} hsa_ven_amd_loader_code_object_storage_type_t; + +/** + * @brief Loaded memory segment descriptor. + * + * + * @details Loaded memory segment descriptor describes underlying loaded memory + * segment. Loaded memory segment is created/allocated by the executable during + * the loading of the code object that is backing underlying memory segment. + * + * The lifetime of underlying memory segment is limited by the lifetime of the + * executable that is managing underlying memory segment. + */ +typedef struct hsa_ven_amd_loader_segment_descriptor_s { + /** + * Agent underlying memory segment is allocated on. If the code object that is + * backing underlying memory segment is program code object, then 0. + */ + hsa_agent_t agent; + /** + * Executable that is managing this underlying memory segment. + */ + hsa_executable_t executable; + /** + * Storage type of the code object that is backing underlying memory segment. 
+ */ + hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated + * filepath to the code object; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host + * accessible pointer to the first byte of the code object. + */ + const void *code_object_storage_base; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of + * the filepath to the code object (including null-terminating character); + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in + * bytes, of the memory occupied by the code object. + */ + size_t code_object_storage_size; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - other, then offset, in bytes, from the beginning of the code object to + * the first byte in the code object data is copied from. + */ + size_t code_object_storage_offset; + /** + * Starting address of the underlying memory segment. + */ + const void *segment_base; + /** + * Size, in bytes, of the underlying memory segment. + */ + size_t segment_size; +} hsa_ven_amd_loader_segment_descriptor_t; + +/** + * @brief Either queries loaded memory segment descriptors, or total number of + * loaded memory segment descriptors. + * + * + * @details If @p segment_descriptors is not null and @p num_segment_descriptors + * points to number that exactly matches total number of loaded memory segment + * descriptors, then queries loaded memory segment descriptors, and records them + * in @p segment_descriptors. 
If @p segment_descriptors is null and @p + * num_segment_descriptors points to zero, then queries total number of loaded + * memory segment descriptors, and records it in @p num_segment_descriptors. In + * all other cases returns appropriate error code (see below). + * + * The caller of this function is responsible for the allocation/deallocation + * and the lifetime of @p segment_descriptors and @p num_segment_descriptors. + * + * The lifetime of loaded memory segments that are described by queried loaded + * memory segment descriptors is limited by the lifetime of the executable that + * is managing loaded memory segments. + * + * Queried loaded memory segment descriptors are always self-consistent: they + * describe a complete set of loaded memory segments that are being backed by + * fully loaded code objects that are present at the time (i.e. this function + * is blocked until all executable manipulations are fully complete). + * + * + * @param[out] segment_descriptors Pointer to application-allocated buffer to + * record queried loaded memory segment descriptors in. Can be null if @p + * num_segment_descriptors points to zero. + * + * @param[in,out] num_segment_descriptors Pointer to application-allocated + * buffer that contains either total number of loaded memory segment descriptors + * or zero. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null + * while @p num_segment_descriptors points to non-zero number, @p + * segment_descriptors is not null while @p num_segment_descriptors points to + * zero, or @p num_segment_descriptors is null. + * + * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors + * does not point to number that exactly matches total number of loaded memory + * segment descriptors. 
+ */ +hsa_status_t hsa_ven_amd_loader_query_segment_descriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + +/** + * @brief Obtains the handle of executable to which the device address belongs. + * + * @details This method should not be used to obtain executable handle by using + * a host address. The executable returned is expected to be alive until it is + * destroyed by the user. + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there + * is no executable found for this kernel code object. + */ +hsa_status_t hsa_ven_amd_loader_query_executable( + const void *device_address, + hsa_executable_t *executable); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the loaded code objects in an executable, and invoke + * an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per loaded code object. The + * HSA runtime passes three arguments to the callback: the executable, a + * loaded code object, and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and + * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that + * status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. 
+ */ +hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + +/** + * @brief Loaded code object kind. + */ +typedef enum { + /** + * Program code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1, + /** + * Agent code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2 +} hsa_ven_amd_loader_loaded_code_object_kind_t; + +/** + * @brief Loaded code object attributes. + */ +typedef enum hsa_ven_amd_loader_loaded_code_object_info_e { + /** + * The executable in which this loaded code object is loaded. The + * type of this attribute is ::hsa_executable_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1, + /** + * The kind of this loaded code object. The type of this attribute is + * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2, + /** + * The agent on which this loaded code object is loaded. The + * value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this + * attribute is ::hsa_agent_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3, + /** + * The storage type of the code object reader used to load the loaded code object. + * The type of this attribute is ::uint32_t interpreted as a + * ::hsa_ven_amd_loader_code_object_storage_type_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4, + /** + * The memory address of the first byte of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. 
+ */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5, + /** + * The memory size in bytes of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6, + /** + * The file descriptor of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this + * attribute is ::int. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7, + /** + * The signed byte address difference of the memory address at which the code + * object is loaded minus the virtual address specified in the code object + * that is loaded. The value of this attribute is only defined if the + * executable in which the code object is loaded is frozen. The type of this + * attribute is ::int64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8, + /** + * The base memory address at which the code object is loaded. This is the + * base address of the allocation for the lowest addressed segment of the code + * object that is loaded. Note that any non-loaded segments before the first + * loaded segment are ignored. The value of this attribute is only defined if + * the executable in which the code object is loaded is frozen. The type of + * this attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9, + /** + * The byte size of the loaded code object's contiguous memory allocation. The + * value of this attribute is only defined if the executable in which the code + * object is loaded is frozen. The type of this attribute is ::uint64_t. 
+ */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10, + /** + * The length of the URI in bytes, not including the NUL terminator. The type + * of this attribute is uint32_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11, + /** + * The URI name from which the code object was loaded. The type of this + * attribute is a NUL terminated \p char* with the length equal to the value + * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute. + * The URI name syntax is defined by the following BNF syntax: + * + * code_object_uri ::== file_uri | memory_uri + * file_uri ::== "file://" file_path [ range_specifier ] + * memory_uri ::== "memory://" process_id range_specifier + * range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number + * file_path ::== URI_ENCODED_OS_FILE_PATH + * process_id ::== DECIMAL_NUMBER + * number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER + * + * ``number`` is a C integral literal where hexadecimal values are prefixed by + * "0x" or "0X", and octal values by "0". + * + * ``file_path`` is the file's path specified as a URI encoded UTF-8 string. + * In URI encoding, every character that is not in the regular expression + * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits + * preceded by "%". Directories in the path are separated by "/". + * + * ``offset`` is a 0-based byte offset to the start of the code object. For a + * file URI, it is from the start of the file specified by the ``file_path``, + * and if omitted defaults to 0. For a memory URI, it is the memory address + * and is required. + * + * ``size`` is the number of bytes in the code object. For a file URI, if + * omitted it defaults to the size of the file. It is required for a memory + * URI. + * + * ``process_id`` is the identity of the process owning the memory. For Linux + * it is the C unsigned integral decimal literal for the process ID (PID). 
+ * + * For example: + * + * file:///dir1/dir2/file1 + * file:///dir3/dir4/file2#offset=0x2000&size=3000 + * memory://1234#offset=0x20000&size=3000 + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12, +} hsa_ven_amd_loader_loaded_code_object_info_t; + +/** + * @brief Get the current value of an attribute for a given loaded code + * object. + * + * @param[in] loaded_code_object Loaded code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * loaded code object attribute, or @p value is NULL. + */ +hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + +//===----------------------------------------------------------------------===// + +/** + * @brief Create a code object reader to operate on a file with size and offset. + * + * @param[in] file File descriptor. The file must have been opened by + * application with at least read permissions prior to calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[in] size Size of the code object embedded in @p file. 
+ * + * @param[in] offset 0-based offset relative to the beginning of the @p file + * that denotes the beginning of the code object embedded within the @p file. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least + * read permissions. This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset + * do not form a valid code object, the file size is 0, or offset > file size. + * This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t +hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the available executables, and invoke an + * application-defined callback on every iteration. While + * ::hsa_ven_amd_loader_iterate_executables is executing any calls to + * ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy will be blocked. + * + * @param[in] callback Callback to be invoked once per executable. The HSA + * runtime passes two arguments to the callback: the executable and the + * application data. 
If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_ven_amd_loader_iterate_executables returns that status value. If + * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy then the behavior is undefined. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +//===----------------------------------------------------------------------===// + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_loader 001003 + +/** + * @brief Extension function table version 1.00. + */ +typedef struct hsa_ven_amd_loader_1_00_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); +} hsa_ven_amd_loader_1_00_pfn_t; + +/** + * @brief Extension function table version 1.01. 
+ */ +typedef struct hsa_ven_amd_loader_1_01_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); +} hsa_ven_amd_loader_1_01_pfn_t; + +/** + * @brief Extension function table version 1.02. + */ +typedef struct hsa_ven_amd_loader_1_02_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t 
offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); +} hsa_ven_amd_loader_1_02_pfn_t; + +/** + * @brief Extension function table version 1.03. + */ +typedef struct hsa_ven_amd_loader_1_03_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + + hsa_status_t + (*hsa_ven_amd_loader_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); +} hsa_ven_amd_loader_1_03_pfn_t; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* HSA_VEN_AMD_LOADER_H */ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h new file mode 100644 index 0000000000..019f0ea5c9 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/hsa/hsa_ven_amd_pc_sampling.h @@ -0,0 +1,416 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License 
(NCSA) +// +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_H +#define HSA_VEN_AMD_PC_SAMPLING_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + + +/** + * @brief HSA AMD Vendor PC Sampling APIs + * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be + * modified extensively in the future + */ + +/** + * @brief PC Sampling sample data for hosttrap sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t reserved0; + uint64_t reserved1; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_hosttrap_v1_t; + +/** + * @brief PC Sampling sample data for stochastic sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_snapshot_v1_t; + +/** + * @brief PC Sampling method kinds + */ +typedef enum { + HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1, + HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 +} hsa_ven_amd_pcs_method_kind_t; + +/** + * @brief PC Sampling interval unit type + */ +typedef enum { + HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS +} hsa_ven_amd_pcs_units_t; + +/** + * @brief HSA callback function to perform the copy onto a destination buffer + * + * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal + * 
buffers. Remaining contents of HSA internal buffers will be included in next + * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling + * hsa_ven_amd_pcs_flush. + * + * @param[in] hsa_callback_data private data to pass back to HSA. Provided in + * hsa_ven_amd_pcs_data_ready_callback_t + * + * @param[in] data_size size of destination buffer in bytes. + * @param[in] destination destination buffer + * @retval TBD: but could be used to indicate that there is no more data to be read. + * Or indicate an error and abort of current copy operations + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data, + size_t data_size, void* destination); + +/** + * @brief HSA callback function to indicate that there is data ready to be copied + * + * When the client receives this callback, the client should call back @p data_copy_callback for HSA + * to perform the copy operation into an available buffer. @p data_copy_callback can be called back + * multiple times with smaller @p data_size to split the copy operation. + * + * This callback must not call ::hsa_ven_amd_pcs_flush. + * + * @param[in] client_callback_data client private data passed in via + * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id + * @param[in] data_size size of data available to be copied + * @param[in] lost_sample_count number of lost samples since last call to + * hsa_ven_amd_pcs_data_ready_callback_t. + * @param[in] data_copy_callback callback function for HSA to perform the actual copy + * @param[in] hsa_callback_data private data to pass back to HSA + */ +typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)( + void* client_callback_data, size_t data_size, size_t lost_sample_count, + hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data); + +/** + * @brief Opaque handle representing a sampling session. 
+ * Two sessions having same handle value represent the same session + */ +typedef struct { + uint64_t handle; +} hsa_ven_amd_pcs_t; + +/** + * @brief PC Sampling configuration flag options + */ +typedef enum { + /* The interval for this sampling method has to be a power of 2 */ + HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0) +} hsa_ven_amd_pcs_configuration_flags_t; + +/** + * @brief PC Sampling method information + * Used to provide client with list of supported PC Sampling methods + */ +typedef struct { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t min_interval; + size_t max_interval; + uint64_t flags; +} hsa_ven_amd_pcs_configuration_t; + +/** + * @brief Callback function to iterate through list of supported PC Sampling configurations + * + * @param[in] configuration one entry for supported PC Sampling method and configuration options + * @param[in] callback_data client private callback data that was passed in when calling + * hsa_ven_amd_pcs_iterate_configuration + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration_callback_t)( + const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data); + +/** + * @brief Iterate through list of current supported PC Sampling configurations for this @p agent + * + * HSA will callback @p configuration_callback for each currently available PC Sampling + * configuration. The list of currently available configurations may not be the complete list of + * configurations supported on the @p agent. The list of currently available configurations may be + * reduced if the @p agent is currently handling other PC sampling sessions. 
+ * + * @param[in] agent target agent + * @param[in] configuration_callback callback function to iterate through list of configurations + * @param[in] callback_data client private callback data + **/ +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +/** + * @brief Create a PC Sampling session on @p agent + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, as described by the + * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of + * hsa_ven_amd_pcs_iterate_configuration for this @p agent. + * A successful call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. 
@p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + + +/** + * @brief Creates a PC Sampling session on @p agent. Assumes that the caller provides the + * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling + * on the @p agent. + * + * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing + * PC sampling session that was previously created in the underlying driver. + * + * Allocate the resources required for a PC Sampling session. 
The @p method, @p units, @p interval + * parameters must be a legal configuration value, and match the parameters that we used to create + * the underlying PC Sampling session in the underlying driver. + * A successful call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There are enough samples to fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. 
+ * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +/** + * @brief Free a PC Sampling session on @p agent + * + * Free all the resources allocated for a PC Sampling session on @p agent + * Internal buffers for this session will be lost. + * If the session was active, the session will be stopped before it is destroyed. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Start a PC Sampling session + * + * Activate a PC Sampling session that was previously created. 
+ * The session will be in an active state after this call + * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session started successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Stop a PC Sampling session + * + * Stop a session that is currently active + * After a session is stopped HSA may still have some PC Sampling data in its internal buffers. + * The internal buffers can be drained using hsa_ven_amd_pcs_flush. If the internal + * buffers are not drained and the session is started again, the internal buffers will be available + * on the next data_ready_callback. + * If the session was already inactive, this will result in a no-op and will return + * HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session stopped successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Flush internal buffers for a PC Sampling session + * + * Drain internal buffers for a PC Sampling session. If internal buffers have available data, + * this triggers a data_ready_callback. + * + * The function blocks until all PC samples associated with the @p pc_sampling session + * generated prior to the function call have been communicated by invocations of + * @p data_ready_callback having completed execution. 
+ * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session flushed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +#define hsa_ven_amd_pc_sampling_1_00 + +/** + * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by + * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t { + hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_create_from_id)( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling); + +} hsa_ven_amd_pc_sampling_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif /* HSA_VEN_AMD_PC_SAMPLING_H */ diff --git a/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h b/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h new file mode 100644 index 
0000000000..44b7fb00aa --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/pm4_cmds.h @@ -0,0 +1,1090 @@ +#ifndef _WSL_INC_PM4_CMDS_H_ +#define _WSL_INC_PM4_CMDS_H_ + +#include <stdint.h> + +#define mmCOMPUTE_NUM_THREAD_X 0x2E07 +#define mmCOMPUTE_PGM_LO 0x2E0C +#define mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO 0x2E10 +#define mmCOMPUTE_PGM_RSRC1 0x2E12 +#define mmCOMPUTE_PGM_RSRC3 0x2E28 +#define mmCOMPUTE_RESOURCE_LIMITS 0x2E15 +#define mmCOMPUTE_USER_DATA_0 0x2E40 + +#define PM4_TYPE_SHIFT 30 +#define PM4_COUNT_SHIFT 16 +#define PM4_OPCODE_SHIFT 8 +#define PM4_SHADER_TYPE_SHIFT 1 + +#define PM4_GFX_SHADER 0 +#define PM4_COMPUTE_SHADER 1 +// Type-3 header word: type 3, (_count_ - 2) in the count field, _opc_ in the opcode field, compute shader-type bit set; unsigned shifts, fully parenthesized. +#define PM4_TYPE3_HDR(_opc_, _count_) \ + ((uint32_t)(((uint32_t)3 << PM4_TYPE_SHIFT) | \ + ((uint32_t)((_count_) - 2) << PM4_COUNT_SHIFT) | \ + ((uint32_t)(_opc_) << PM4_OPCODE_SHIFT) | \ + ((uint32_t)PM4_COMPUTE_SHADER << PM4_SHADER_TYPE_SHIFT))) + +union PM4_MEC_TYPE_3_HEADER { + struct { + uint32_t reserved1 : 8; ///< reserved + uint32_t opcode : 8; ///< IT opcode + uint32_t count : 14;///< number of DWORDs - 1 in the information body. + uint32_t type : 2; ///< packet identifier. 
It should be 3 for type 3 packets + }; + uint32_t u32All; +}; + +#define IT_DISPATCH_DIRECT 0x15 +#define IT_ATOMIC_MEM 0x1E +#define IT_WRITE_DATA 0x37 +#define IT_INDIRECT_BUFFER 0x3F +#define IT_COPY_DATA 0x40 +#define IT_EVENT_WRITE 0x46 +#define IT_RELEASE_MEM 0x49 +#define IT_ACQUIRE_MEM 0x58 +#define IT_SET_SH_REG 0x76 + +struct PM4_MEC_SET_SH_REG { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reg_offset:16; + uint32_t reserved1:16; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_DISPATCH_DIRECT { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t dim_x; + uint32_t dim_y; + uint32_t dim_z; + uint32_t dispatch_initiator; +}; + +// ------------------------------- MEC_EVENT_WRITE_event_index_enum ------------------------------- +enum MEC_EVENT_WRITE_event_index_enum { + event_index__mec_event_write__other = 0, + event_index__mec_event_write__sample_pipelinestat = 2, + event_index__mec_event_write__cs_partial_flush = 4, + event_index__mec_event_write__sample_streamoutstats__GFX11 = 8, + event_index__mec_event_write__sample_streamoutstats1__GFX11 = 9, + event_index__mec_event_write__sample_streamoutstats2__GFX11 = 10, + event_index__mec_event_write__sample_streamoutstats3__GFX11 = 11, +}; + +enum VGT_EVENT_TYPE { + Reserved_0x00 = 0x00000000, + SAMPLE_STREAMOUTSTATS1 = 0x00000001, + SAMPLE_STREAMOUTSTATS2 = 0x00000002, + SAMPLE_STREAMOUTSTATS3 = 0x00000003, + CACHE_FLUSH_TS = 0x00000004, + CONTEXT_DONE = 0x00000005, + CACHE_FLUSH = 0x00000006, + CS_PARTIAL_FLUSH = 0x00000007, + VGT_STREAMOUT_SYNC = 0x00000008, + VGT_STREAMOUT_RESET = 0x0000000a, + END_OF_PIPE_INCR_DE = 0x0000000b, + END_OF_PIPE_IB_END = 0x0000000c, + RST_PIX_CNT = 0x0000000d, + BREAK_BATCH = 0x0000000e, + VS_PARTIAL_FLUSH = 0x0000000f, + PS_PARTIAL_FLUSH = 0x00000010, + FLUSH_HS_OUTPUT = 0x00000011, + FLUSH_DFSM = 0x00000012, + RESET_TO_LOWEST_VGT = 0x00000013, + CACHE_FLUSH_AND_INV_TS_EVENT = 
0x00000014, + CACHE_FLUSH_AND_INV_EVENT = 0x00000016, + PERFCOUNTER_START = 0x00000017, + PERFCOUNTER_STOP = 0x00000018, + PIPELINESTAT_START = 0x00000019, + PIPELINESTAT_STOP = 0x0000001a, + PERFCOUNTER_SAMPLE = 0x0000001b, + SAMPLE_PIPELINESTAT = 0x0000001e, + SO_VGTSTREAMOUT_FLUSH = 0x0000001f, + SAMPLE_STREAMOUTSTATS = 0x00000020, + RESET_VTX_CNT = 0x00000021, + BLOCK_CONTEXT_DONE = 0x00000022, + CS_CONTEXT_DONE = 0x00000023, + VGT_FLUSH = 0x00000024, + TGID_ROLLOVER = 0x00000025, + SQ_NON_EVENT = 0x00000026, + SC_SEND_DB_VPZ = 0x00000027, + BOTTOM_OF_PIPE_TS = 0x00000028, + FLUSH_SX_TS = 0x00000029, + DB_CACHE_FLUSH_AND_INV = 0x0000002a, + FLUSH_AND_INV_DB_DATA_TS = 0x0000002b, + FLUSH_AND_INV_DB_META = 0x0000002c, + FLUSH_AND_INV_CB_DATA_TS = 0x0000002d, + FLUSH_AND_INV_CB_META = 0x0000002e, + CS_DONE = 0x0000002f, + PS_DONE = 0x00000030, + FLUSH_AND_INV_CB_PIXEL_DATA = 0x00000031, + SX_CB_RAT_ACK_REQUEST = 0x00000032, + THREAD_TRACE_START = 0x00000033, + THREAD_TRACE_STOP = 0x00000034, + THREAD_TRACE_MARKER = 0x00000035, + THREAD_TRACE_FINISH = 0x00000037, + PIXEL_PIPE_STAT_CONTROL = 0x00000038, + PIXEL_PIPE_STAT_DUMP = 0x00000039, + PIXEL_PIPE_STAT_RESET = 0x0000003a, + CONTEXT_SUSPEND = 0x0000003b, + OFFCHIP_HS_DEALLOC = 0x0000003c, + ENABLE_NGG_PIPELINE = 0x0000003d, + SET_FE_ID__GFX09 = 0x00000009, + Available_0x1c__GFX09 = 0x0000001c, + Available_0x1d__GFX09 = 0x0000001d, + THREAD_TRACE_FLUSH__GFX09 = 0x00000036, + Reserved_0x3f__GFX09 = 0x0000003f, + ZPASS_DONE__GFX09_10 = 0x00000015, + ENABLE_LEGACY_PIPELINE__GFX09_10 = 0x0000003e, + Reserved_0x09__GFX10PLUS = 0x00000009, + FLUSH_ES_OUTPUT__GFX10PLUS = 0x0000001c, + BIN_CONF_OVERRIDE_CHECK__GFX10PLUS = 0x0000001d, + THREAD_TRACE_DRAW__GFX10PLUS = 0x00000036, + DRAW_DONE__GFX10PLUS = 0x0000003f, + WAIT_SYNC__GFX11 = 0x00000015, + ENABLE_PIPELINE_NOT_USED__GFX11 = 0x0000003e, +}; + +struct PM4_MEC_EVENT_WRITE { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + 
uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t reserved2:19; + uint32_t offload_enable:1; + } bitfields2; + uint32_t ordinal2; + }; +}; + +struct PM4_MEC_ATOMIC_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t atomic:7; + uint32_t reserved1:1; + uint32_t command:4; + uint32_t reserved2:13; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t src_data_lo; + uint32_t src_data_hi; + uint32_t cmp_data_lo; + uint32_t cmp_data_hi; + union { + struct { + uint32_t loop_interval:13; + uint32_t reserved4:19; + } bitfields9; + uint32_t ordinal9; + }; +}; + +struct PM4_MEC_WRITE_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t reserved1:8; + uint32_t dst_sel:4; + uint32_t reserved2:4; + uint32_t addr_incr:1; + uint32_t reserved3:2; + uint32_t resume_vf:1; + uint32_t wr_confirm:1; + uint32_t reserved4:4; + uint32_t cache_policy:2; + uint32_t reserved5:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t dst_mmreg_addr:18; + uint32_t reserved6:14; + } bitfields3a; + struct { + uint32_t dst_gds_addr:16; + uint32_t reserved7:16; + } bitfields3b; + struct { + uint32_t reserved8:2; + uint32_t dst_mem_addr_lo:30; + } bitfields3c; + uint32_t ordinal3; + }; + uint32_t dst_mem_addr_hi; + uint64_t write_data_value; +}; + +#define PERSISTENT_SPACE_START 0x00002c00 + +template <typename T> // Writes pm4->cmd_set_data: SET_SH_REG header sized from sizeof(T), reg_offset = reg_addr - PERSISTENT_SPACE_START +void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, + sizeof(T) / sizeof(uint32_t)); + pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START; +} + +template <typename T> // Writes pm4->header: type-3 header for op_code, sized from sizeof(T) in DWORDs +void GenerateCmdHeader(T* pm4, int op_code) { + pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t)); +} + +/// @brief Defines the Gpu command to dispatch a kernel. 
It embeds +/// various Gpu hardware specific data structures for initialization +/// and configuration before a dispatch begins to run +struct DispatchTemplate { + + /// @brief Structure used to initialize the group dimensions + /// of a kernel dispatch and if performance counters are enabled + struct DispatchDimensionRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_num_thread_x; + uint32_t compute_num_thread_y; + uint32_t compute_num_thread_z; + } dimension_regs; + + struct DispatchProgramRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_lo; + uint32_t compute_pgm_hi; + } program_regs; + + struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc1; + uint32_t compute_pgm_rsrc2; + } program_resource_regs; + + /// @brief Structure used to initialize parameters related to + /// thread management i.e. number of waves to issue and number + /// of Compute Units to use + struct DispatchResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_resource_limits; + uint32_t compute_static_thread_mgmt_se0; + uint32_t compute_static_thread_mgmt_se1; + uint32_t compute_tmpring_size; + uint32_t compute_static_thread_mgmt_se2; + uint32_t compute_static_thread_mgmt_se3; + } resource_regs; + + /// @brief Structure used to pass handles of the Aql dispatch + /// packet, Aql queue, Kernel argument address block, Scratch + /// buffer + struct DispatchComputeUserDataRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_user_data[16]; + } compute_user_data_regs; + + /// @brief Structure used to configure Cache flush policy + /// and dimensions of total work size + PM4_MEC_DISPATCH_DIRECT dispatch_direct; +}; + +struct DispatchProgramResourceRegs { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t compute_pgm_rsrc3; +}; + + +/// @brief Structure used to issue a scratch programming command for gfx11+ +struct SetScratchTemplate { + PM4_MEC_SET_SH_REG cmd_set_data; + uint32_t scratch_lo; + uint32_t scratch_hi; 
+}; + +/// @brief Structure used to issue a Gpu Barrier command +struct BarrierTemplate { + PM4_MEC_EVENT_WRITE event_write; +}; + +//--------------------MEC_ATOMIC_MEM-------------------- +enum MEC_ATOMIC_MEM_command_enum { + command__mec_atomic_mem__single_pass_atomic = 0, + command__mec_atomic_mem__loop_until_compare_satisfied = 1, + command__mec_atomic_mem__wait_for_write_confirmation = 2, + command__mec_atomic_mem__send_and_continue = 3, +}; + +enum MEC_ATOMIC_MEM_cache_policy_enum { + cache_policy__mec_atomic_mem__lru = 0, + cache_policy__mec_atomic_mem__stream = 1, + cache_policy__mec_atomic_mem__noa = 2, + cache_policy__mec_atomic_mem__bypass = 3, +}; + +enum TC_OP { + TC_OP_READ = 0x00000000, + TC_OP_ATOMIC_FCMPSWAP_RTN_32 = 0x00000001, + TC_OP_ATOMIC_FMIN_RTN_32 = 0x00000002, + TC_OP_ATOMIC_FMAX_RTN_32 = 0x00000003, + TC_OP_RESERVED_FOP_RTN_32_0 = 0x00000004, + TC_OP_RESERVED_FOP_RTN_32_2 = 0x00000006, + TC_OP_ATOMIC_SWAP_RTN_32 = 0x00000007, + TC_OP_ATOMIC_CMPSWAP_RTN_32 = 0x00000008, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_32 = 0x00000009, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_32 = 0x0000000a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_32 = 0x0000000b, + TC_OP_PROBE_FILTER = 0x0000000c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_2 = 0x0000000e, + TC_OP_ATOMIC_ADD_RTN_32 = 0x0000000f, + TC_OP_ATOMIC_SUB_RTN_32 = 0x00000010, + TC_OP_ATOMIC_SMIN_RTN_32 = 0x00000011, + TC_OP_ATOMIC_UMIN_RTN_32 = 0x00000012, + TC_OP_ATOMIC_SMAX_RTN_32 = 0x00000013, + TC_OP_ATOMIC_UMAX_RTN_32 = 0x00000014, + TC_OP_ATOMIC_AND_RTN_32 = 0x00000015, + TC_OP_ATOMIC_OR_RTN_32 = 0x00000016, + TC_OP_ATOMIC_XOR_RTN_32 = 0x00000017, + TC_OP_ATOMIC_INC_RTN_32 = 0x00000018, + TC_OP_ATOMIC_DEC_RTN_32 = 0x00000019, + TC_OP_WBINVL1_VOL = 0x0000001a, + TC_OP_WBINVL1_SD = 0x0000001b, + TC_OP_RESERVED_NON_FLOAT_RTN_32_0 = 0x0000001c, + TC_OP_RESERVED_NON_FLOAT_RTN_32_1 = 0x0000001d, + TC_OP_RESERVED_NON_FLOAT_RTN_32_2 = 0x0000001e, + TC_OP_RESERVED_NON_FLOAT_RTN_32_3 = 0x0000001f, + TC_OP_WRITE = 
0x00000020, + TC_OP_ATOMIC_FCMPSWAP_RTN_64 = 0x00000021, + TC_OP_ATOMIC_FMIN_RTN_64 = 0x00000022, + TC_OP_ATOMIC_FMAX_RTN_64 = 0x00000023, + TC_OP_RESERVED_FOP_RTN_64_0 = 0x00000024, + TC_OP_RESERVED_FOP_RTN_64_1 = 0x00000025, + TC_OP_RESERVED_FOP_RTN_64_2 = 0x00000026, + TC_OP_ATOMIC_SWAP_RTN_64 = 0x00000027, + TC_OP_ATOMIC_CMPSWAP_RTN_64 = 0x00000028, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_RTN_64 = 0x00000029, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_RTN_64 = 0x0000002a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_RTN_64 = 0x0000002b, + TC_OP_WBINVL2_SD = 0x0000002c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_0 = 0x0000002d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_64_1 = 0x0000002e, + TC_OP_ATOMIC_ADD_RTN_64 = 0x0000002f, + TC_OP_ATOMIC_SUB_RTN_64 = 0x00000030, + TC_OP_ATOMIC_SMIN_RTN_64 = 0x00000031, + TC_OP_ATOMIC_UMIN_RTN_64 = 0x00000032, + TC_OP_ATOMIC_SMAX_RTN_64 = 0x00000033, + TC_OP_ATOMIC_UMAX_RTN_64 = 0x00000034, + TC_OP_ATOMIC_AND_RTN_64 = 0x00000035, + TC_OP_ATOMIC_OR_RTN_64 = 0x00000036, + TC_OP_ATOMIC_XOR_RTN_64 = 0x00000037, + TC_OP_ATOMIC_INC_RTN_64 = 0x00000038, + TC_OP_ATOMIC_DEC_RTN_64 = 0x00000039, + TC_OP_WBL2_NC = 0x0000003a, + TC_OP_WBL2_WC = 0x0000003b, + TC_OP_RESERVED_NON_FLOAT_RTN_64_1 = 0x0000003c, + TC_OP_RESERVED_NON_FLOAT_RTN_64_2 = 0x0000003d, + TC_OP_RESERVED_NON_FLOAT_RTN_64_3 = 0x0000003e, + TC_OP_RESERVED_NON_FLOAT_RTN_64_4 = 0x0000003f, + TC_OP_WBINVL1 = 0x00000040, + TC_OP_ATOMIC_FCMPSWAP_32 = 0x00000041, + TC_OP_ATOMIC_FMIN_32 = 0x00000042, + TC_OP_ATOMIC_FMAX_32 = 0x00000043, + TC_OP_RESERVED_FOP_32_0 = 0x00000044, + TC_OP_RESERVED_FOP_32_2 = 0x00000046, + TC_OP_ATOMIC_SWAP_32 = 0x00000047, + TC_OP_ATOMIC_CMPSWAP_32 = 0x00000048, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_32 = 0x00000049, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_32 = 0x0000004a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_32 = 0x0000004b, + TC_OP_INV_METADATA = 0x0000004c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_2 = 0x0000004e, + TC_OP_ATOMIC_ADD_32 = 0x0000004f, + TC_OP_ATOMIC_SUB_32 = 0x00000050, + 
TC_OP_ATOMIC_SMIN_32 = 0x00000051, + TC_OP_ATOMIC_UMIN_32 = 0x00000052, + TC_OP_ATOMIC_SMAX_32 = 0x00000053, + TC_OP_ATOMIC_UMAX_32 = 0x00000054, + TC_OP_ATOMIC_AND_32 = 0x00000055, + TC_OP_ATOMIC_OR_32 = 0x00000056, + TC_OP_ATOMIC_XOR_32 = 0x00000057, + TC_OP_ATOMIC_INC_32 = 0x00000058, + TC_OP_ATOMIC_DEC_32 = 0x00000059, + TC_OP_INVL2_NC = 0x0000005a, + TC_OP_NOP_RTN0 = 0x0000005b, + TC_OP_RESERVED_NON_FLOAT_32_1 = 0x0000005c, + TC_OP_RESERVED_NON_FLOAT_32_2 = 0x0000005d, + TC_OP_RESERVED_NON_FLOAT_32_3 = 0x0000005e, + TC_OP_RESERVED_NON_FLOAT_32_4 = 0x0000005f, + TC_OP_WBINVL2 = 0x00000060, + TC_OP_ATOMIC_FCMPSWAP_64 = 0x00000061, + TC_OP_ATOMIC_FMIN_64 = 0x00000062, + TC_OP_ATOMIC_FMAX_64 = 0x00000063, + TC_OP_RESERVED_FOP_64_0 = 0x00000064, + TC_OP_RESERVED_FOP_64_1 = 0x00000065, + TC_OP_RESERVED_FOP_64_2 = 0x00000066, + TC_OP_ATOMIC_SWAP_64 = 0x00000067, + TC_OP_ATOMIC_CMPSWAP_64 = 0x00000068, + TC_OP_ATOMIC_FCMPSWAP_FLUSH_DENORM_64 = 0x00000069, + TC_OP_ATOMIC_FMIN_FLUSH_DENORM_64 = 0x0000006a, + TC_OP_ATOMIC_FMAX_FLUSH_DENORM_64 = 0x0000006b, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_0 = 0x0000006c, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_1 = 0x0000006d, + TC_OP_RESERVED_FOP_FLUSH_DENORM_64_2 = 0x0000006e, + TC_OP_ATOMIC_ADD_64 = 0x0000006f, + TC_OP_ATOMIC_SUB_64 = 0x00000070, + TC_OP_ATOMIC_SMIN_64 = 0x00000071, + TC_OP_ATOMIC_UMIN_64 = 0x00000072, + TC_OP_ATOMIC_SMAX_64 = 0x00000073, + TC_OP_ATOMIC_UMAX_64 = 0x00000074, + TC_OP_ATOMIC_AND_64 = 0x00000075, + TC_OP_ATOMIC_OR_64 = 0x00000076, + TC_OP_ATOMIC_XOR_64 = 0x00000077, + TC_OP_ATOMIC_INC_64 = 0x00000078, + TC_OP_ATOMIC_DEC_64 = 0x00000079, + TC_OP_WBINVL2_NC = 0x0000007a, + TC_OP_NOP_ACK = 0x0000007b, + TC_OP_RESERVED_NON_FLOAT_64_1 = 0x0000007c, + TC_OP_RESERVED_NON_FLOAT_64_2 = 0x0000007d, + TC_OP_RESERVED_NON_FLOAT_64_3 = 0x0000007e, + TC_OP_RESERVED_NON_FLOAT_64_4 = 0x0000007f, + TC_OP_RESERVED_FOP_RTN_32_1__GFX09_10 = 0x00000005, + TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_1__GFX09_10 = 0x0000000d, + 
TC_OP_RESERVED_FOP_32_1__GFX09_10 = 0x00000045, + TC_OP_RESERVED_FOP_FLUSH_DENORM_32_1__GFX09_10 = 0x0000004d, + TC_OP_RESERVED_FADD_RTN_32__GFX11 = 0x00000005, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_RTN_32__GFX11 = 0x0000000d, + TC_OP_RESERVED_FADD_32__GFX11 = 0x00000045, + TC_OP_ATOMIC_FADD_FLUSH_DENORM_32__GFX11 = 0x0000004d, +}; + +// Desc: Structure used to perform various atomic +// operations - add, subtract, increment, etc +struct AtomicTemplate { + PM4_MEC_ATOMIC_MEM atomic; +}; + +/// @brief PM4 command to write a 64-bit value into a memory +/// location accessible to Gpu +struct WriteDataTemplate { + PM4_MEC_WRITE_DATA write_data; +}; + +// ---------------------------------- MEC_COPY_DATA_src_sel_enum ---------------------------------- +enum MEC_COPY_DATA_src_sel_enum { + src_sel__mec_copy_data__mem_mapped_register = 0, + src_sel__mec_copy_data__tc_l2_obsolete = 1, + src_sel__mec_copy_data__tc_l2 = 2, + src_sel__mec_copy_data__gds = 3, + src_sel__mec_copy_data__perfcounters = 4, + src_sel__mec_copy_data__immediate_data = 5, + src_sel__mec_copy_data__atomic_return_data = 6, + src_sel__mec_copy_data__gds_atomic_return_data0 = 7, + src_sel__mec_copy_data__gds_atomic_return_data1 = 8, + src_sel__mec_copy_data__gpu_clock_count = 9, + src_sel__mec_copy_data__system_clock_count = 10, + src_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ---------------------------------- MEC_COPY_DATA_dst_sel_enum ---------------------------------- +enum MEC_COPY_DATA_dst_sel_enum { + dst_sel__mec_copy_data__mem_mapped_register = 0, + dst_sel__mec_copy_data__tc_l2 = 2, + dst_sel__mec_copy_data__gds = 3, + dst_sel__mec_copy_data__perfcounters = 4, + dst_sel__mec_copy_data__tc_l2_obsolete = 5, + dst_sel__mec_copy_data__mem_mapped_reg_dc = 6, + dst_sel__mec_copy_data__ext32perfcntr = 11, +}; + +// ------------------------------ MEC_COPY_DATA_src_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_src_cache_policy_enum { +
src_cache_policy__mec_copy_data__stream = 1, + src_cache_policy__mec_copy_data__noa = 2, + src_cache_policy__mec_copy_data__bypass = 3, +}; + +// --------------------------------- MEC_COPY_DATA_count_sel_enum --------------------------------- +enum MEC_COPY_DATA_count_sel_enum { + count_sel__mec_copy_data__32_bits_of_data = 0, + count_sel__mec_copy_data__64_bits_of_data = 1, +}; + +// --------------------------------- MEC_COPY_DATA_wr_confirm_enum --------------------------------- +enum MEC_COPY_DATA_wr_confirm_enum { + wr_confirm__mec_copy_data__do_not_wait_for_confirmation = 0, + wr_confirm__mec_copy_data__wait_for_confirmation = 1, +}; + +// ------------------------------ MEC_COPY_DATA_dst_cache_policy_enum ------------------------------ +enum MEC_COPY_DATA_dst_cache_policy_enum { + dst_cache_policy__mec_copy_data__lru = 0, + dst_cache_policy__mec_copy_data__stream = 1, + dst_cache_policy__mec_copy_data__noa = 2, + dst_cache_policy__mec_copy_data__bypass = 3, +}; + +// ------------------------------- MEC_COPY_DATA_pq_exe_status_enum ------------------------------- +enum MEC_COPY_DATA_pq_exe_status_enum { + pq_exe_status__mec_copy_data__default = 0, + pq_exe_status__mec_copy_data__phase_update = 1, +}; + +// ------------------------------- MEC_WRITE_DATA_dst_sel_enum ------------------------------- +enum MEC_WRITE_DATA_dst_sel_enum { + dst_sel__mec_write_data__mem_mapped_register = 0, + dst_sel__mec_write_data__tc_l2 = 2, + dst_sel__mec_write_data__gds = 3, + dst_sel__mec_write_data__memory = 5, + dst_sel__mec_write_data__memory_mapped_adc_persistent_state = 6 }; + +// ------------------------------- MEC_WRITE_DATA_addr_incr_enum ------------------------------- +enum MEC_WRITE_DATA_addr_incr_enum { + addr_incr__mec_write_data__increment_address = 0, + addr_incr__mec_write_data__do_not_increment_address = 1 }; + +// ------------------------------- MEC_WRITE_DATA_wr_confirm_enum ------------------------------- +enum MEC_WRITE_DATA_wr_confirm_enum { + 
wr_confirm__mec_write_data__do_not_wait_for_write_confirmation = 0, + wr_confirm__mec_write_data__wait_for_write_confirmation = 1 }; + +// ------------------------------- MEC_WRITE_DATA_cache_policy_enum ------------------------------- +enum MEC_WRITE_DATA_cache_policy_enum { + cache_policy__mec_write_data__lru = 0, + cache_policy__mec_write_data__stream = 1, + cache_policy__mec_write_data__noa = 2, + cache_policy__mec_write_data__bypass = 3 }; + +typedef struct PM4_MEC_COPY_DATA { + union { + PM4_MEC_TYPE_3_HEADER header; /// header + uint32_t ordinal1; + }; + union { + struct { + uint32_t src_sel : 4; + uint32_t reserved1 : 4; + uint32_t dst_sel : 4; + uint32_t reserved2 : 1; + uint32_t src_cache_policy : 2; + uint32_t reserved3 : 1; + uint32_t count_sel : 1; + uint32_t reserved4 : 3; + uint32_t wr_confirm : 1; + uint32_t reserved5 : 4; + uint32_t dst_cache_policy : 2; + uint32_t reserved6 : 2; + uint32_t pq_exe_status : 1; + uint32_t reserved7 : 2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t src_reg_offset : 18; + uint32_t reserved8 : 14; + } bitfields3a; + struct { + uint32_t reserved9 : 2; + uint32_t src_32b_addr_lo : 30; + } bitfields3b; + struct { + uint32_t reserved10 : 3; + uint32_t src_64b_addr_lo : 29; + } bitfields3c; + struct { + uint32_t src_gds_addr_lo : 16; + uint32_t reserved11 : 16; + } bitfields3d; + uint32_t imm_data; + uint32_t ordinal3; + }; + union { + uint32_t src_memtc_addr_hi; + uint32_t src_imm_data; + uint32_t ordinal4; + }; + union { + struct { + uint32_t dst_reg_offset : 18; + uint32_t reserved12 : 14; + } bitfields5a; + struct { + uint32_t reserved13 : 2; + uint32_t dst_32b_addr_lo : 30; + } bitfields5b; + struct { + uint32_t reserved14 : 3; + uint32_t dst_64b_addr_lo : 29; + } bitfields5c; + struct { + uint32_t dst_gds_addr_lo : 16; + uint32_t reserved15 : 16; + } bitfields5d; + uint32_t ordinal5; + }; + uint32_t dst_addr_hi; +} PM4MEC_COPY_DATA; +namespace gfx9 { + +struct PM4_MEC_ACQUIRE_MEM { + union 
{ + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t coher_cntl:31; + uint32_t reserved1:1; + } bitfields2; + uint32_t ordinal2; + }; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } bitfields7; + uint32_t ordinal7; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t tcl1_vol_action_ena:1; + uint32_t tc_vol_action_ena:1; + uint32_t reserved2:1; + uint32_t tc_wb_action_ena:1; + uint32_t tcl1_action_ena:1; + uint32_t tc_action_ena:1; + uint32_t reserved3:1; + uint32_t tc_nc_action_ena:1; + uint32_t tc_wc_action_ena:1; + uint32_t tc_md_action_ena:1; + uint32_t reserved4:3; + uint32_t cache_policy:2; + uint32_t reserved5:2; + uint32_t pq_exe_status:1; + uint32_t reserved6:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:6; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t 
reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:24; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved3:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t reserved4:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved5:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved6:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved7:16; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +/// @brief PM4 command to wait for a certain event before proceeding +/// to process another command on the queue +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx9 namespace + +namespace gfx10 { + +struct PM4_MEC_ACQUIRE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + uint32_t reserved1; + uint32_t coher_size; + union { + struct { + uint32_t coher_size_hi:8; + uint32_t reserved2:24; + } bitfields4; + uint32_t ordinal4; + }; + uint32_t coher_base_lo; + union { + struct { + uint32_t coher_base_hi:24; + uint32_t reserved3:8; + } bitfields6; + uint32_t ordinal6; + }; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved4:16; + } 
bitfields7; + uint32_t ordinal7; + }; + union { + struct { + uint32_t gcr_cntl:19; + uint32_t reserved4:13; + } bitfields8; + uint32_t ordinal8; + }; +}; + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:12; + uint32_t reserved2:1; + uint32_t cache_policy:2; + uint32_t reserved3:2; + uint32_t pq_exe_status:1; + uint32_t reserved4:2; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved7:16; + uint32_t dst_sel:2; + uint32_t reserved8:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved9:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved10:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved11:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved12; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved13; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t num_dwords:16; + } bitfields6c; + uint32_t reserved14; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved15; + uint32_t reserved16; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct PM4_MEC_WAIT_REG_MEM64 { + union { + PM4_MEC_TYPE_3_HEADER header; ///header + uint32_t ordinal1; + }; + union { + struct { + uint32_t function:3; + uint32_t reserved1:1; + uint32_t mem_space:2; + uint32_t operation:2; + uint32_t reserved2:14; + uint32_t mes_intr_pipe:2; + uint32_t mes_action:1; + uint32_t cache_policy:2; + uint32_t reserved3:5; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved4:3; + uint32_t mem_poll_addr_lo:29; + } bitfields3a; + struct { + uint32_t reg_poll_addr:18; + uint32_t 
reserved5:14; + } bitfields3b; + struct { + uint32_t reg_write_addr1:18; + uint32_t reserved6:14; + } bitfields3c; + uint32_t ordinal3; + }; + union { + uint32_t mem_poll_addr_hi; + struct { + uint32_t reg_write_addr2:18; + uint32_t reserved7:14; + } bitfields4b; + uint32_t ordinal4; + }; + uint32_t reference; + uint32_t reference_hi; + uint32_t mask; + uint32_t mask_hi; + union { + struct { + uint32_t poll_interval:16; + uint32_t reserved8:15; + uint32_t optimize_ace_offload_mode:1; + } bitfields9; + uint32_t ordinal9; + }; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4_MEC_ACQUIRE_MEM acquire_mem; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +struct WaitRegMem64Template { + PM4_MEC_WAIT_REG_MEM64 wait_reg_mem; +}; + +} // gfx10 namespace + +namespace gfx11 { + +struct PM4_MEC_RELEASE_MEM { + union { + PM4_MEC_TYPE_3_HEADER header; + uint32_t ordinal1; + }; + union { + struct { + uint32_t event_type:6; + uint32_t reserved1:2; + uint32_t event_index:4; + uint32_t gcr_cntl:13; + uint32_t cache_policy:2; + uint32_t reserved2:1; + uint32_t pq_exe_status:1; + uint32_t reserved3:1; + uint32_t glk_inv:1; + uint32_t reserved4:1; + } bitfields2; + uint32_t ordinal2; + }; + union { + struct { + uint32_t reserved5:16; + uint32_t dst_sel:2; + uint32_t reserved6:2; + uint32_t mes_intr_pipe:2; + uint32_t mes_action_id:2; + uint32_t int_sel:3; + uint32_t reserved7:2; + uint32_t data_sel:3; + } bitfields3; + uint32_t ordinal3; + }; + union { + struct { + uint32_t reserved8:2; + uint32_t address_lo_32b:30; + } bitfields4a; + struct { + uint32_t reserved9:3; + uint32_t address_lo_64b:29; + } bitfields4b; + uint32_t reserved10; + uint32_t ordinal4; + }; + union { + uint32_t address_hi; + uint32_t reserved11; + uint32_t ordinal5; + }; + union { + uint32_t data_lo; + uint32_t cmp_data_lo; + struct { + uint32_t dw_offset:16; + uint32_t 
num_dwords:16; + } bitfields6c; + uint32_t reserved12; + uint32_t ordinal6; + }; + union { + uint32_t data_hi; + uint32_t cmp_data_hi; + uint32_t reserved13; + uint32_t reserved14; + uint32_t ordinal7; + }; + uint32_t int_ctxid; +}; + +struct EndofKernelNotifyTemplate { + PM4_MEC_RELEASE_MEM release_mem; +}; + +} // gfx11 namespace + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/registers.h b/projects/rocr-runtime/libhsakmt/include/impl/registers.h new file mode 100644 index 0000000000..4d430b41e4 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/registers.h @@ -0,0 +1,363 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// This file is used only for open source CMake builds. If the register values +// were hardcoded in amd_aql_queue.cpp, this file would not be required. For +// now, the register details are spelled out in the structs/unions defined +// below. 
+#ifndef _WSL_INC_REGISTERS_H_ +#define _WSL_INC_REGISTERS_H_ + +typedef enum SQ_RSRC_BUF_TYPE { +SQ_RSRC_BUF = 0x00000000, +SQ_RSRC_BUF_RSVD_1 = 0x00000001, +SQ_RSRC_BUF_RSVD_2 = 0x00000002, +SQ_RSRC_BUF_RSVD_3 = 0x00000003, +} SQ_RSRC_BUF_TYPE; + +typedef enum BUF_DATA_FORMAT { +BUF_DATA_FORMAT_INVALID = 0x00000000, +BUF_DATA_FORMAT_8 = 0x00000001, +BUF_DATA_FORMAT_16 = 0x00000002, +BUF_DATA_FORMAT_8_8 = 0x00000003, +BUF_DATA_FORMAT_32 = 0x00000004, +BUF_DATA_FORMAT_16_16 = 0x00000005, +BUF_DATA_FORMAT_10_11_11 = 0x00000006, +BUF_DATA_FORMAT_11_11_10 = 0x00000007, +BUF_DATA_FORMAT_10_10_10_2 = 0x00000008, +BUF_DATA_FORMAT_2_10_10_10 = 0x00000009, +BUF_DATA_FORMAT_8_8_8_8 = 0x0000000a, +BUF_DATA_FORMAT_32_32 = 0x0000000b, +BUF_DATA_FORMAT_16_16_16_16 = 0x0000000c, +BUF_DATA_FORMAT_32_32_32 = 0x0000000d, +BUF_DATA_FORMAT_32_32_32_32 = 0x0000000e, +BUF_DATA_FORMAT_RESERVED_15 = 0x0000000f, +} BUF_DATA_FORMAT; + +typedef enum BUF_NUM_FORMAT { +BUF_NUM_FORMAT_UNORM = 0x00000000, +BUF_NUM_FORMAT_SNORM = 0x00000001, +BUF_NUM_FORMAT_USCALED = 0x00000002, +BUF_NUM_FORMAT_SSCALED = 0x00000003, +BUF_NUM_FORMAT_UINT = 0x00000004, +BUF_NUM_FORMAT_SINT = 0x00000005, +BUF_NUM_FORMAT_SNORM_OGL__SI__CI = 0x00000006, +BUF_NUM_FORMAT_RESERVED_6__VI = 0x00000006, +BUF_NUM_FORMAT_FLOAT = 0x00000007, +} BUF_NUM_FORMAT; + +typedef enum BUF_FORMAT { +BUF_FORMAT_32_UINT = 0x00000014, +} BUF_FORMAT; + +typedef enum SQ_SEL_XYZW01 { +SQ_SEL_0 = 0x00000000, +SQ_SEL_1 = 0x00000001, +SQ_SEL_RESERVED_0 = 0x00000002, +SQ_SEL_RESERVED_1 = 0x00000003, +SQ_SEL_X = 0x00000004, +SQ_SEL_Y = 0x00000005, +SQ_SEL_Z = 0x00000006, +SQ_SEL_W = 0x00000007, +} SQ_SEL_XYZW01; + + union COMPUTE_TMPRING_SIZE { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 13; + unsigned int : 7; +#elif defined(BIGENDIAN_CPU) + unsigned int : 7; + unsigned int WAVESIZE : 13; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float 
f32All; + }; + + union COMPUTE_TMPRING_SIZE_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 15; + unsigned int : 5; +#elif defined(BIGENDIAN_CPU) + unsigned int : 5; + unsigned int WAVESIZE : 15; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union COMPUTE_TMPRING_SIZE_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int WAVES : 12; + unsigned int WAVESIZE : 18; + unsigned int : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int : 2; + unsigned int WAVESIZE : 18; + unsigned int WAVES : 12; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD0 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#elif defined(BIGENDIAN_CPU) + unsigned int BASE_ADDRESS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD1 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int CACHE_SWIZZLE : 1; + unsigned int SWIZZLE_ENABLE : 1; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 1; + unsigned int CACHE_SWIZZLE : 1; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD1_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int BASE_ADDRESS_HI : 16; + unsigned int STRIDE : 14; + unsigned int SWIZZLE_ENABLE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int SWIZZLE_ENABLE : 2; + unsigned int STRIDE : 14; + unsigned int BASE_ADDRESS_HI : 16; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD2 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int NUM_RECORDS : 32; +#elif defined(BIGENDIAN_CPU) 
+ unsigned int NUM_RECORDS : 32; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + + union SQ_BUF_RSRC_WORD3 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int NUM_FORMAT : 3; + unsigned int DATA_FORMAT : 4; + unsigned int ELEMENT_SIZE : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int HASH_ENABLE : 1; + unsigned int HEAP : 1; + unsigned int MTYPE__CI__VI : 3; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int MTYPE__CI__VI : 3; + unsigned int HEAP : 1; + unsigned int HASH_ENABLE : 1; + unsigned int ATC__CI__VI : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int ELEMENT_SIZE : 2; + unsigned int DATA_FORMAT : 4; + unsigned int NUM_FORMAT : 3; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + union SQ_BUF_RSRC_WORD3_GFX10 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 7; + unsigned int RESERVED1 : 2; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESOURCE_LEVEL : 1; + unsigned int RESERVED2 : 3; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 3; + unsigned int RESOURCE_LEVEL : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 2; + unsigned int FORMAT : 7; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; 
+ unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX11 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int RESERVED2 : 4; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int RESERVED2 : 4; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; + // From V# Table + union SQ_BUF_RSRC_WORD3_GFX12 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int DST_SEL_X : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_W : 3; + unsigned int FORMAT : 6; + unsigned int RESERVED1 : 3; + unsigned int INDEX_STRIDE : 2; + unsigned int ADD_TID_ENABLE : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int COMPRESSION_EN : 1; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int OOB_SELECT : 2; + unsigned int TYPE : 2; +#elif defined(BIGENDIAN_CPU) + unsigned int TYPE : 2; + unsigned int OOB_SELECT : 2; + unsigned int COMPRESSION_ACCESS_MODE : 2; + unsigned int COMPRESSION_EN : 1; + unsigned int WRITE_COMPRESS_ENABLE : 1; + unsigned int ADD_TID_ENABLE : 1; + unsigned int INDEX_STRIDE : 2; + unsigned int RESERVED1 : 3; + unsigned int FORMAT : 6; + unsigned int DST_SEL_W : 3; + unsigned int DST_SEL_Z : 3; + unsigned int DST_SEL_Y : 3; + unsigned int DST_SEL_X : 3; +#endif + } bitfields, 
bits; + unsigned int u32All; + signed int i32All; + float f32All; + }; +#endif // header guard diff --git a/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h new file mode 100644 index 0000000000..d6bdce2451 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/thunk_proxy.h @@ -0,0 +1,122 @@ +#ifndef _WSL_INC_THUNK_PROXY_H_ +#define _WSL_INC_THUNK_PROXY_H_ + +#include + +namespace thunk_proxy { +enum AllocDomain { + kSystem, + kLocal, + kUserMemory, + kUserQueue, + kDomainCount, +}; + +enum MemFlag { + kFineGrain = (1ULL << 0), + kKernarg = (1ULL << 1), +}; + +enum EngineFlag { + KCOMPUTE0 = (1ULL << 0), + KDRMDMA = (1ULL << 1), + KDRMDMA1 = (1ULL << 2), +}; + +enum SchedLevel { + kLow = 0, + kNormal = 1, + kHigh = 2, +}; + +struct HwsInfo { + union { + struct { + uint32_t gfxHwsEnabled : 1; + uint32_t computeHwsEnabled : 1; + uint32_t dmaHwsEnabled : 1; + uint32_t dma1HwsEnabled : 1; + uint32_t reserved : 28; + } hwsMask; + uint32_t osHwsEnableFlags; + }; + uint64_t engineOrdinalMask; // Indicates which engines (by ordinal) support MES HWS +}; + +typedef struct { + int major; + int minor; + int stepping; + bool is_dgpu; + char product_name[MAX_PATH]; + uint64_t uuid; + uint32_t family; + uint32_t device_id; + uint32_t wavefront_size; + uint32_t compute_unit_count; + uint32_t max_engine_clock_mhz; + uint32_t watch_points_num; + uint32_t pci_bus_addr; + uint32_t memory_bus_width; + uint32_t max_memory_clock_mhz; + uint64_t gpu_counter_frequency; + uint32_t wave_per_cu; + uint32_t simd_per_cu; + uint32_t max_scratch_slots_per_cu; + uint32_t num_shader_engine; + uint32_t shader_array_per_shader_engine; + uint32_t domain; + uint32_t num_gws; + uint32_t asic_revision; + uint64_t local_visible_heap_size; + uint64_t local_invisible_heap_size; + uint64_t non_local_heap_size; + uint64_t private_aperture_base; + uint64_t private_aperture_size; + uint64_t 
shared_aperture_base; + uint64_t shared_aperture_size; + uint32_t user_queue_size; + uint32_t lds_size; + uint32_t big_page_alignment_size; + uint32_t hw_big_page_min_alignment_size; + uint32_t hw_big_page_alignment_size; + bool enable_big_page_alignment; + uint32_t mec_fw_version; + uint32_t sdma_fw_version; + uint32_t l1_cache_size; + uint32_t l2_cache_size; + uint32_t l3_cache_size; + uint32_t gl2_cacheline_size; + uint32_t num_cp_queues; + HwsInfo hwsInfo; + std::vector sdma_schedid; + uint32_t compute_schedid; + bool state_shadowing_by_cpfw; + bool platform_atomic_support; + void *adapter_info; + uint32_t kmd_version; +} DeviceInfo; + +int EngineOrdinal(int engine, DeviceInfo *device_info); +bool GetHwsEnabled(int engine, DeviceInfo *device_info); +bool ShouldDisableGpuTimeout(int engine, DeviceInfo *device_info); +bool ParseAdapterInfo(D3DKMT_HANDLE adapter, DeviceInfo *device_info); +bool QueryAdapterSupported(unsigned int device_id); + +uint32_t QueueEngine2EngineFlag(uint32_t queue_engine); +void SetAllocationInfo(void *data, uint64_t size, AllocDomain domain, + uint64_t addr, uint32_t mem_flags, uint32_t engine_flag, const DeviceInfo &device_info); +void GetAllocPrivDataSize(int *priv_drv_data_size, int *priv_alloc_data_size); +void FillinAllocPrivDrvData(void *drv_priv, int priv_alloc_data_size); + +int GetSubmitPrivDataSize(); +void FillinSubmitPrivData(void *priv_data, D3DKMT_HANDLE queue, uint64_t command_addr, + uint64_t command_size, bool is_hw_queue); +int GetHwQueuePrivDataSize(); +void FillinHwQueuePrivData(void *priv_data, bool FwManagedGfxState, SchedLevel level = kNormal); +int GetContextPrivDataSize(); +void FillinContextPrivData(void *priv_data, bool FwManagedGfxState); +int GetPowerOptPrivDataSize(); +void FillinPowerOptPrivData(void *priv_data, bool restore); +} +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h new file mode 
100644 index 0000000000..3fd3f69553 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/thunk_proxy/wddm_types.h @@ -0,0 +1,169 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_
+#define _WSL_INC_THUNK_PROXY_WDDM_TYPES_H_
+
+// NOTE(review): the header names of the two includes below are missing (they
+// look lost in extraction). The typedefs that follow need the fixed-width
+// integer types, so one of them is presumably <stdint.h> -- restore both
+// from the upstream file.
+#include
+
+#include
+
+// Windows base-type names mapped onto fixed-width C types.
+// NOTE(review): UINT_PTR, ULONG_PTR and ULONG64_PTR are declared here as
+// pointers to the integer type; in the Windows SDK, UINT_PTR and ULONG_PTR
+// are pointer-sized unsigned integers -- confirm the difference is intended.
+typedef uint32_t UINT, *UINT_PTR;
+typedef int32_t INT32;
+typedef int32_t LONG;
+typedef uint32_t ULONG, *ULONG_PTR;
+typedef int64_t LONGLONG;
+typedef int64_t LONG64;
+typedef uint64_t ULONGLONG;
+typedef uint64_t ULONG64, *ULONG64_PTR;
+typedef uint8_t BYTE;
+typedef uint16_t WORD;
+typedef uint32_t DWORD;
+typedef int32_t BOOL;
+typedef int32_t NTSTATUS;
+typedef uint16_t USHORT;
+typedef uint16_t UINT16;
+typedef uint32_t UINT32;
+typedef uint64_t UINT64;
+typedef int32_t INT;
+typedef uint64_t SIZE_T;
+typedef void VOID;
+typedef float FLOAT;
+typedef char CHAR;
+typedef unsigned char UCHAR;
+typedef UCHAR BOOLEAN;
+typedef int16_t WCHAR;
+typedef void *HANDLE;
+typedef void *PVOID;
+typedef void *LPVOID;
+typedef const int16_t *PCWSTR;
+
+// Self-referential macros: each expands to its own name, so the typedefs
+// above still apply while `#ifdef`-style checks see the name as defined
+// (presumably for headers included later -- verify).
+#define ULONG ULONG
+#define ULONG_PTR ULONG_PTR
+#define USHORT USHORT
+
+// DECLARE_HANDLE(name): declares struct name##__ and makes `name` a pointer
+// to it, giving each handle a distinct pointer type.
+// C_ASSERT(e): compile-time assertion; the array size is -1 when `e` is false.
+#define DECLARE_HANDLE(name) struct name##__{int unused;}; typedef struct name##__ *name
+#define C_ASSERT(e) typedef char __C_ASSERT__[(e)?1:-1]
+
+DECLARE_HANDLE(HWND);
+DECLARE_HANDLE(HDC);
+DECLARE_HANDLE(PALETTEENTRY);
+
+// Geometry structures.
+typedef struct tagPOINT {
+  LONG x;
+  LONG y;
+} POINT;
+
+typedef struct tagRECT {
+  LONG left;
+  LONG top;
+  LONG right;
+  LONG bottom;
+} RECT;
+
+typedef struct tagRECTL {
+  LONG left;
+  LONG top;
+  LONG right;
+  LONG bottom;
+} RECTL;
+
+// 64-bit signed value, also addressable as two 32-bit parts through `u`.
+typedef union _LARGE_INTEGER {
+  struct {
+    DWORD LowPart;
+    DWORD HighPart;
+  } u;
+  LONGLONG QuadPart;
+} LARGE_INTEGER;
+
+typedef LARGE_INTEGER *PLARGE_INTEGER;
+
+// 64-bit unsigned value; the two parts are reachable through either
+// `DUMMYSTRUCTNAME` or `u`.
+typedef union _ULARGE_INTEGER {
+  struct {
+    ULONG LowPart;
+    ULONG HighPart;
+  } DUMMYSTRUCTNAME;
+  struct {
+    ULONG LowPart;
+    ULONG HighPart;
+  } u;
+  ULONGLONG QuadPart;
+} ULARGE_INTEGER;
+
+typedef ULARGE_INTEGER *PULARGE_INTEGER;
+
+// Locally unique identifier: unsigned low part, signed high part.
+typedef struct _LUID {
+  ULONG LowPart;
+  LONG HighPart;
+} LUID, *PLUID;
+
+typedef enum _DEVICE_POWER_STATE {
+  PowerDeviceUnspecified = 0,
+  PowerDeviceD0,
+  PowerDeviceD1,
+  PowerDeviceD2,
+  PowerDeviceD3,
+  PowerDeviceMaximum
+} DEVICE_POWER_STATE, *PDEVICE_POWER_STATE;
+
+// Windows annotations and calling-convention keywords are defined to expand
+// to nothing; CONST maps to `const` and MAX_PATH is fixed at 260.
+#define _Check_return_
+#define APIENTRY
+#define CONST const
+#define IN
+#define OUT
+#define FAR
+#define MAX_PATH 260
+#define __stdcall
+
+// GUID is only declared when no earlier header has defined GUID_DEFINED.
+#ifndef GUID_DEFINED
+#define GUID_DEFINED
+typedef struct _GUID {
+  uint32_t Data1;
+  uint16_t Data2;
+  uint16_t Data3;
+  uint8_t Data4[ 8 ];
+} GUID;
+#endif
+
+// NOTE(review): the header name of this include is missing as well. It comes
+// after all the shim types, so it is presumably the header that consumes
+// them -- restore from the upstream file.
+#include
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h
new file mode 100644
index 0000000000..f1e7d22d91
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/cmd_util.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
*/ + +#ifndef _WSL_INC_WDDM_CMD_UTIL_H_ +#define _WSL_INC_WDDM_CMD_UTIL_H_ + +#include +#include "impl/hsa/hsa.h" +#include "impl/hsa/amd_hsa_queue.h" +#include "impl/hsa/amd_hsa_kernel_code.h" +#include "impl/pm4_cmds.h" +#include "util/utils.h" + +namespace wsl { +namespace thunk { + +struct DispatchInfo { + uint8_t major; + hsa_kernel_dispatch_packet_t *pPacket; + void *pEntry; + const amd_kernel_code_t *pKernelObject; + uint32_t ldsBlks; + amd_queue_v2_t *pAmdQueue; + bool wave32; + uint32_t srd; + void *pScratchBase; + uint32_t scratchSizePerWave; + uint32_t scratchBaseOffset[2]; + uint32_t offsetCnt; +}; + +class CmdUtil { +public: + CmdUtil() {}; + ~CmdUtil() {}; + + static size_t BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel = dst_sel__mec_copy_data__tc_l2, + uint32_t dstCachePolicy = dst_cache_policy__mec_copy_data__stream, + uint32_t srcSel = src_sel__mec_copy_data__gpu_clock_count, + uint32_t srcCachePolicy = src_cache_policy__mec_copy_data__lru, + uint32_t countSel = count_sel__mec_copy_data__64_bits_of_data, + uint32_t wrConfirm = wr_confirm__mec_copy_data__wait_for_confirmation); + + static size_t BuildBarrier( + void *pBuffer, + uint32_t eventIndex = event_index__mec_event_write__cs_partial_flush, + uint32_t eventType = CS_PARTIAL_FLUSH); + + static size_t BuildWriteData64Command( + void *pBuffer, + uint64_t* write_addr, + uint64_t write_value); + + static size_t BuildAcquireMem( + uint8_t major, + void *pBuffer); + + static size_t BuildScratch( + void *pScratchBase, + void *pBuffer); + + static size_t BuildComputeShaderParams( + void *pBuffer); + + static size_t BuildDispatch( + struct DispatchInfo *pInfo, + void *pBuffer); + + static size_t BuildAtomicMem( + uint64_t *pAddr, + uint32_t atomic, + void *pBuffer, + uint32_t cachePolicy = cache_policy__mec_atomic_mem__stream, + uint64_t srcData = 1); +}; + +} // namespace thunk +} // namespace wsl + +#endif \ No newline at end of file diff --git 
a/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h new file mode 100644 index 0000000000..15821b5483 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/device.h @@ -0,0 +1,246 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_DEVICE_H_ +#define _WSL_INC_WDDM_DEVICE_H_ + +#include +#include + +#include +#include +#include + +#include "impl/wddm/types.h" +#include "impl/thunk_proxy/thunk_proxy.h" +#include "impl/wddm/va_mgr.h" +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" +#include "impl/wddm/gpu_memory.h" +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +//class Queue; +class WDDMQueue; + +// WSL2 hyperv GPADL protocol limitation +#define MAX_USERPTR_BLOCK_SIZE 0xf0000000 +#define START_NON_CANONICAL_ADDR (1ULL << 47) +#define END_NON_CANONICAL_ADDR (~0UL - (1UL << 47)) +#define IS_OVERLAPPING(start1, size1, start2, size2) \ + ((start1 < (start2 + size2)) && (start2 < (start1 + size1))) + +struct SegmentInfo { + uint32_t segment_id; + uint32_t segment_type; // 0=aperture, 1=gpu memory, 2=system memory + bool aperture; + bool system_memory; + uint64_t commit_limit; + + SegmentInfo() + : segment_id(0), segment_type(0), aperture(false), + system_memory(false), commit_limit(0) {} +}; + +class WDDMDevice { +public: + static constexpr size_t GpuMemoryChunkSize = 2 * (1ULL << 30); // 2 GB + + WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id); + ~WDDMDevice(); + + int NodeId() const { return node_id_; } + int Major() { return device_info_.major; } + int Minor() { return device_info_.minor; } + int Stepping() { return device_info_.stepping; } + bool IsDgpu() { return device_info_.is_dgpu; } + const char *ProductName() { return device_info_.product_name; } + uint64_t Uuid() { return device_info_.uuid; } + uint32_t GfxFamily() { return 
device_info_.family; } + uint32_t DeviceId() { return device_info_.device_id; } + uint32_t WavefrontSize() { return device_info_.wavefront_size; } + uint32_t ComputeUnitCount() { return device_info_.compute_unit_count; } + uint32_t MaxEngineClockMhz() { return device_info_.max_engine_clock_mhz; } + uint32_t WatchPointsNum() { return device_info_.watch_points_num; } + uint32_t PciBusAddr() { return device_info_.pci_bus_addr; } + + uint32_t MemoryBusWidth() { return device_info_.memory_bus_width; } + uint32_t MaxMemoryClockMhz() { return device_info_.max_memory_clock_mhz; } + uint32_t WavePerCu() { return device_info_.wave_per_cu; } + uint32_t SimdPerCu() { return device_info_.simd_per_cu; } + uint32_t MaxScratchSlotsPerCu() { return device_info_.max_scratch_slots_per_cu; } + uint32_t NumShaderEngine() { return device_info_.num_shader_engine; } + uint32_t ShaderArrayPerShaderEngine() { return device_info_.shader_array_per_shader_engine; } + uint32_t NumSdmaEngine() { return device_info_.sdma_schedid.size(); } + uint32_t Domain() { return device_info_.domain; } + uint32_t NumGws() { return device_info_.num_gws; } + uint32_t AsicRevision() { return device_info_.asic_revision; } + uint64_t LocalHeapSize() { return device_info_.local_visible_heap_size + device_info_.local_invisible_heap_size; } + uint64_t LocalVisibleHeapSize() { return device_info_.local_visible_heap_size; } + uint64_t LocalInvisibleHeapSize() { return device_info_.local_invisible_heap_size; } + uint64_t NonLocalHeapSize() { return device_info_.non_local_heap_size; } + uint64_t PrivateApertureBase() { return device_info_.private_aperture_base; } + uint64_t PrivateApertureSize() { return device_info_.private_aperture_size; } + uint64_t SharedApertureBase() { return device_info_.shared_aperture_base; } + uint64_t SharedApertureSize() { return device_info_.shared_aperture_size; } + uint32_t LdsSize() { return device_info_.lds_size; } + uint64_t GPUCounterFrequency() { return 
device_info_.gpu_counter_frequency; } + uint32_t GetSwsQueueSize(void) const { return device_info_.user_queue_size; } + uint32_t GetMecFwVersion() { return device_info_.mec_fw_version; } + uint32_t GetSdmaFwVersion() { return device_info_.sdma_fw_version; } + uint32_t GetL1CacheSize() { return device_info_.l1_cache_size; } + uint32_t GetL2CacheSize() { return device_info_.l2_cache_size; } + uint32_t GetL3CacheSize() { return device_info_.l3_cache_size; } + uint32_t Gl2CacheLineSize() { return device_info_.gl2_cacheline_size; } + bool SupportStateShadowingByCpFw(void) const { return device_info_.state_shadowing_by_cpfw; } + bool SupportPlatformAtomic(void) const { return device_info_.platform_atomic_support; } + uint32_t GetSdmaEngine(uint32_t idx) { + assert(idx < NumSdmaEngine()); + return device_info_.sdma_schedid[idx]; + } + uint32_t GetComputeEngine() { return device_info_.compute_schedid; } + + uint64_t VramAvail(); + + void GetClockCounters(uint64_t *gpu, uint64_t *cpu); + uint32_t GetNumCpQueues() { return device_info_.num_cp_queues; } + + bool CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr); + void DestroySyncobj(D3DKMT_HANDLE handle); + + bool CreateQueue(WDDMQueue *queue); + void DestroyQueue(WDDMQueue *queue); + bool CreateHwQueue(WDDMQueue *queue); + bool DestroyHwQueue(WDDMQueue *queue); + bool SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + bool SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value); + + bool WaitPagingFence(WDDMQueue *queue) { + uint64_t value = page_fence_value_; + + if (*page_fence_addr_ < value && + !GpuWait(queue, &page_syncobj_, &value, 1)) + return false; + + return true; + } + + bool GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs, + uint64_t *values, int count); + bool GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs, + uint64_t *value, int count); + bool CpuWait(const D3DKMT_HANDLE 
*syncobjs, uint64_t *value, + int count, bool wait_any); + bool WaitOnPagingFenceFromCpu(); + + uint32_t LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt); + uint32_t GetCmdbufSize(void) const { return cmdbuf_size_; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size_; } + static uint32_t GetAqlFrameNum(void) { return cmdbuf_aql_frame_num_; } + + // Both legacy HWS and stage 1 HWS use KMD to alloc use queue memory, + // return false by default + bool AllocUserQueueMemFromUMD(void) const { return false; } + + bool IsHwsEnabled(int engine) { + return thunk_proxy::GetHwsEnabled(engine, &device_info_); + } + + void UpdatePageFence(uint64_t fence_value); + + D3DKMT_HANDLE PagingQueue() const { return page_queue_; } + D3DKMT_HANDLE PagingFence() const { return page_syncobj_; } + D3DKMT_HANDLE DeviceHandle() const { return device_; } + LUID GetLuid() const { return adapter_luid_; } + D3DKMT_HANDLE GetAdapter() const { return adapter_; } + + const thunk_proxy::DeviceInfo& DeviceInfo() const { return device_info_; } + + ErrorCode CreateGpuMemory(const GpuMemoryCreateInfo &create_info, GpuMemory **gpu_mem, gpusize *gpu_va = nullptr); + +private: + bool ParseDeviceInfo(void); + void DestroyDeviceInfo(void); + bool CreateDevice(void); + bool DestroyDevice(void); + bool CreatePagingQueue(void); + bool DestroyPagingQueue(void); + void *Lock(D3DKMT_HANDLE handle); + bool Unlock(D3DKMT_HANDLE handle); + bool CreateContext(int engine, D3DKMT_HANDLE *handle); + bool DestroyContext(D3DKMT_HANDLE handle); + + void SetPowerOptimization(bool restore); + void InitCmdbufInfo(void); + + bool QuerySegmentInfo(); + bool GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type, uint32_t &segment_id); + + D3DKMT_HANDLE adapter_; + LUID adapter_luid_; + D3DKMT_HANDLE device_; + + D3DKMT_HANDLE page_queue_; + D3DKMT_HANDLE page_syncobj_; + uint64_t *page_fence_addr_; + std::atomic page_fence_value_; + + uint32_t cmdbuf_size_; + uint32_t cmdbuf_aql_frame_size_; + static 
const uint32_t cmdbuf_aql_frame_num_; + uint32_t node_id_; + // device info + thunk_proxy::DeviceInfo device_info_; + std::vector segment_infos_; + //CmdUtil cmd_util; +}; + +NTSTATUS WDDMCreateDevices(std::vector &devices); + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h new file mode 100644 index 0000000000..9703a6d2c7 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/gpu_memory.h @@ -0,0 +1,249 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_GPU_MEMORY_H_ +#define _WSL_INC_WDDM_GPU_MEMORY_H_ + +#include +#include +#include "util/utils.h" +#include "impl/wddm/types.h" +#include "impl/wddm/thunks.h" +#include "impl/thunk_proxy/thunk_proxy.h" + +namespace wsl { +namespace thunk { + +class WDDMDevice; + +union GpuMemoryCreateFlags { + struct { + uint64_t virtual_alloc : 1; // only allocate virtual address, without physical buffer + uint64_t physical_only : 1; // only allocate physical buffer, without virutal address + uint64_t interprocess : 1; // physical buffer need share info between exporter and importer + uint64_t locked : 1; // lock virtual address space into RAM, preventing that memory from being paged to the swap area + uint64_t physical_contiguous : 1; // contiguous physical pages + uint64_t sysmem_ipc_sig_importer : 1; // allocate system memory for IPC signal + uint64_t sysmem_ipc_sig_exporter : 1; // allocate system memory for IPC signal, prepare to export + uint64_t alloc_va : 1; // allocate va. 
0 for vmem import + uint64_t blit_kernel_object : 1; // allocate executable blit kernel object + uint64_t unused : 55; + }; + uint64_t reserved; +}; + +union GpuMemoryDescFlags { + struct { + uint32_t is_virtual : 1; + uint32_t is_shared : 1; + uint32_t is_external : 1; + uint32_t is_physical_only : 1; + uint32_t is_locked : 1; + uint32_t is_queue_referenced : 1; + uint32_t is_physical_contiguous : 1; + uint32_t is_imported_sys_memfd : 1; // 0 - ignored; 1 - va from system heap + uint32_t is_sysmem_exporter : 1; // allocate system memory for IPC signal, prepare to export + uint32_t is_va_required :1; + uint32_t is_imported_vram_vmem :1; + uint32_t is_imported_vram_ipc :1; + uint32_t is_imported_from_same_process : 3; // imported from same process, record shared cnt + uint32_t is_blit_kernel_object : 1; // blit kernel object + uint32_t unused : 16; + }; + + uint32_t reserved; +}; + +struct GpuMemoryCreateInfo { + GpuMemoryCreateInfo() { + flags.reserved = 0; + domain = thunk_proxy::kLocal; + size = 0; + alignment = 0; + mem_flags = 0; + engine_flag = 0; + va_hint = 0; + user_ptr = nullptr; + dmabuf_fd = -1; + } + + GpuMemoryCreateFlags flags; + thunk_proxy::AllocDomain domain; + gpusize size; + gpusize alignment; + int mem_flags; + int engine_flag; + int dmabuf_fd; // Import from dmabuf + + void *user_ptr; + gpusize va_hint; +}; + +struct GpuMemoryDesc { + GpuMemoryDesc() { + gpu_addr = 0; + cpu_addr = nullptr; + client_size = 0; + size = alignment = 0; + flags.reserved = 0; + mem_flags = 0; + engine_flag = 0; + handle_ape_addr = 0; + } + + thunk_proxy::AllocDomain domain; + LUID adapter_luid; // Where is the backing store location + gpusize gpu_addr; + void *cpu_addr; + gpusize client_size; // user request size + gpusize size; + gpusize alignment; + gpusize handle_ape_addr; + + GpuMemoryDescFlags flags; + int mem_flags; + int engine_flag; +}; + +struct SharedHandleInfo { + thunk_proxy::AllocDomain domain; + LUID adapter_luid; + gpusize client_size; // user request 
size + uint64_t size; + uint32_t flags; + int mem_flags; + pid_t pid; + gpusize gpu_addr; +}; + +using GpuMemoryHandle = void *; + +class GpuMemory { +public: + static size_t CalcChunkNumbers(gpusize size); + + ErrorCode Init(const GpuMemoryCreateInfo &create_info); + + WDDMDevice *GetDevice() const { return device_; } + gpusize Size() const { return desc_.size; } + gpusize ClientSize() const { return desc_.client_size; } + uint64_t GpuAddress() const { return desc_.gpu_addr; } + void *CpuAddress() const { return desc_.cpu_addr; } + uint64_t HandleApeAddress() const { return desc_.handle_ape_addr; } + + inline bool IsLocal() const { return desc_.domain == thunk_proxy::kLocal; } + inline bool IsUserMemory() const { return desc_.domain == thunk_proxy::kUserMemory; } + inline bool IsSystem() const { return desc_.domain == thunk_proxy::kSystem; } + inline bool IsSysMemFd() const { return desc_.flags.is_imported_sys_memfd; } + inline bool IsUserQueue() const { return desc_.domain == thunk_proxy::kUserQueue; } + inline bool IsPhysicalOnly() const { return desc_.flags.is_physical_only; } + inline bool IsPhysicalContiguous() const { return desc_.flags.is_physical_contiguous; } + inline bool IsVirtual() const { return desc_.flags.is_virtual; } + inline bool IsShared() const { return desc_.flags.is_shared; } + inline bool IsExternal() const { return desc_.flags.is_external; } + inline bool IsVaAllocated() const { return desc_.flags.is_va_required; } + inline bool IsBlitKernelObject() const { return desc_.flags.is_blit_kernel_object; } + + inline uint32_t Flags() const { return desc_.flags.reserved; } + inline int GetAllocInfo() const { return desc_.mem_flags; } + inline bool IsFineGrain() const { return (desc_.mem_flags & thunk_proxy::kFineGrain); } + inline bool IsSameAdapter(const LUID &luid) const { + return (desc_.adapter_luid.HighPart == luid.HighPart && + desc_.adapter_luid.LowPart == luid.LowPart); + } + inline void GetQueueReference() { 
desc_.flags.is_queue_referenced = 1; } + inline void PutQueueReference() { desc_.flags.is_queue_referenced = 0; } + inline bool IsQueueReferenced() const { return desc_.flags.is_queue_referenced; } + inline void IncSharedReference() { desc_.flags.is_imported_from_same_process++; } + inline uint32_t DecSharedReference() { return (desc_.flags.is_imported_from_same_process == 0) ? 0 : --desc_.flags.is_imported_from_same_process; } + inline bool IsSharedFromSameProcess() const { return desc_.flags.is_imported_from_same_process > 0; } + + WinAllocationHandle GetAllocationHandle(size_t index) const { return alloc_handles_ptr_[index]; } + size_t NumChunks() const { return num_allocations_; } + + const GpuMemoryHandle GetGpuMemoryHandle() const { + return reinterpret_cast(const_cast(this)); + } + + static GpuMemory *Convert(GpuMemoryHandle handle) { return reinterpret_cast(handle); } + + ErrorCode ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize va_size, gpusize alignment); + ErrorCode FreeGpuVirtualAddress(gpusize va_start_address, gpusize va_size); + + ErrorCode MapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + ErrorCode UnmapGpuVirtualAddress(const gpusize map_addr, const gpusize size, gpusize offset = 0); + + ErrorCode MakeResident(); + ErrorCode Evict(); + + ErrorCode ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags = SHARED_ALLOCATION_ALL_ACCESS); + ErrorCode ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr = nullptr); + ~GpuMemory(); +protected: + explicit GpuMemory(WDDMDevice *device); +private: + ErrorCode CreatePhysicalMemory(); + ErrorCode FreePhysicalMemory(); + + uint64_t AdjustSize(gpusize size) const; +private: + friend class WDDMDevice; + + WDDMDevice *const device_; + + GpuMemoryDesc desc_; + + size_t num_allocations_; + WinAllocationHandle *alloc_handles_ptr_; + WinAllocationHandle alloc_handle_; // Optimization for num_allocations_ is 1 + + WinResourceHandle resource_; // 
Handle to a resource object that wraps the allocation. Used for shared resources + + int mem_fd_; // IPC sigal's sys mem fd + + DISALLOW_COPY_AND_ASSIGN(GpuMemory); +}; + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h new file mode 100644 index 0000000000..0e936c5721 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/queue.h @@ -0,0 +1,370 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// +#ifndef _WSL_INC_WDDM_QUEUE_H_ +#define _WSL_INC_WDDM_QUEUE_H_ + +#include +#include +#include +#include +#include +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "impl/wddm/gpu_memory.h" +#include "impl/hsa/hsa_ext_amd.h" +#include "impl/hsa/amd_hsa_queue.h" +#include "impl/hsa/amd_hsa_signal.h" +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +class Queue; +class WDDMDevice; + +class WDDMQueue { +public: + WDDMQueue(WDDMDevice *device, + uint64_t cmdbuf_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true) : + device(device), + context(0), + queue(0), + syncobj(0), + sync_addr(NULL), + cmdbuf(0), + cmdbuf_addr(cmdbuf_addr), + cmdbuf_size(cmdbuf_size), + queue_engine(engine), + use_hws(use_hws), + prio(thunk_proxy::kNormal) { + + } + + virtual ~WDDMQueue() { } + + virtual hsa_status_t Init(void) { return HSA_STATUS_SUCCESS; } + virtual hsa_status_t Fini(void) { return HSA_STATUS_SUCCESS; } + virtual void RingDoorbell() { } + virtual void* GetHsaQueueAddr(void) const { return reinterpret_cast(GetCmdbufAddr()); } + + hsa_status_t SwsInit(void); + hsa_status_t SwsFini(void); + hsa_status_t SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); + + hsa_status_t HwsInit(void); + hsa_status_t HwsFini(void); + hsa_status_t HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value); 
+ hsa_status_t SetPriority(hsa_amd_queue_priority_t priority); + + uint64_t *GetSyncAddr(void) const { return sync_addr; } + uint64_t GetCmdbufAddr(void) const { return cmdbuf_addr; } + + thunk_proxy::SchedLevel ConvertSchedLevel(hsa_amd_queue_priority_t prio) const { + switch (prio) { + case HSA_AMD_QUEUE_PRIORITY_LOW: + return thunk_proxy::kLow; + case HSA_AMD_QUEUE_PRIORITY_HIGH: + return thunk_proxy::kHigh; + case HSA_AMD_QUEUE_PRIORITY_NORMAL: + default: + return thunk_proxy::kNormal; + } + } + + WDDMDevice *device; + + D3DKMT_HANDLE context; + D3DKMT_HANDLE queue; + + D3DKMT_HANDLE syncobj; + uint64_t *sync_addr; + + GpuMemoryHandle cmdbuf; + uint64_t cmdbuf_addr; + uint32_t cmdbuf_size; + + GpuMemoryHandle queue_mem; + uint64_t queue_addr; + + uint32_t queue_engine; + + bool use_hws; + thunk_proxy::SchedLevel prio; +}; + +class ComputeQueue : public WDDMQueue { +public: + ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + ~ComputeQueue(); + + virtual hsa_status_t Init(void); + virtual hsa_status_t Fini(void); + virtual hsa_status_t Submit(void); + + void* GetRing(void) const { return ring; } + uint64_t GetRingSize(void) const { return ring_size; } + std::atomic* GetRingWptr(void) const { return ring_wptr; } + std::atomic* GetRingRptr(void) const { return ring_rptr; } + + uint64_t GetAqlWriteIndex(void) const { return cmdbuf_aql_frame_write_index; } + uint32_t GetAqlFrameSize(void) const { return cmdbuf_aql_frame_size; } + void* GetHsaQueueAddr(void) const { return ring; } + + bool IsInvalidPacket(void) const { + uint16_t *packet = (uint16_t *)((char *)ring + + (cmdbuf_aql_frame_write_index % ring_size) * 64); + return ((*packet >> HSA_PACKET_HEADER_TYPE) & ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) + == HSA_PACKET_TYPE_INVALID; + } + + hsa_status_t Process(void); + uint64_t * 
GetDoorbellPtr() const { return (uint64_t *)&doorbell_signal_value_; } + void RingDoorbell(); +private: + hsa_status_t KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet); + hsa_status_t BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or = false); + + uint64_t CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet); + uint64_t CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, bool wave32); + + struct amd_aql_pm4_ib { + uint16_t header; + uint16_t ven_hdr; + uint32_t ib_jump_cmd[4]; + uint32_t dw_cnt_remain; + uint32_t reserved[8]; + hsa_signal_t completion_signal; + }; + hsa_status_t VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet); + hsa_status_t SwitchAql2PM4(void); + + hsa_status_t PreSubmit(void); + hsa_status_t EndSubmit(void); + + void *ring; + uint64_t ring_size; + std::atomic *ring_wptr; + std::atomic *ring_rptr; + + // ib_start_addr is the current ib start address + uint64_t ib_start_addr; + + // ib_size is the current ib size. 
+ uint64_t ib_size; + + // record the last submitted aql frame write index + uint64_t sync_point; + + uint64_t cmdbuf_aql_frame_write_index; + uint32_t cmdbuf_aql_frame_size; + + uint64_t *signal_addr_; + bool platform_atomic_support_; + bool needs_barrier; + bool ready_to_submit; + + CmdUtil cmd_util; + +private: + bool EnableProfiling() { + return AMD_HSA_BITS_GET(amd_queue_rocr_->queue_properties, AMD_QUEUE_PROPERTIES_ENABLE_PROFILING); + } + void HandleError(hsa_status_t status); + bool UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32); + + uint32_t UpdateIndexStride(uint32_t srd, bool wave32); + + void *ScratchBase() { return scratch_base_; } + + void AppendCmdbufSratchBaseOffset(int offset) { + scratch_base_offset_array_.push_back(offset); + } + + bool RelocateCmdbufScratchBase(uint64_t addr); + + uint32_t ScratchSizePerWave() { return scratch_size_per_wave_; } + uint64_t GetKernelObjAddr(uint64_t addr) const; + void InitScratchSRD(); + GpuMemoryHandle amd_queue_mem_; + amd_queue_v2_t *amd_queue_; + amd_queue_v2_t *amd_queue_rocr_; + uint64_t doorbell_signal_value_; + volatile std::atomic *error_code_; + std::thread aql_to_pm4_thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void AqlToPm4Thread(ComputeQueue *queue); + + uint64_t max_scratch_waves_; + uint64_t dispatch_waves_; + uint64_t scratch_size_per_wave_; + uint64_t scratch_size_; + uint64_t total_scratch_size_; + void *scratch_base_; + uint32_t scratch_mem_alignment_size_; + GpuMemoryHandle scratch_mem_; + + std::vector scratch_base_offset_array_; +}; + +class SDMAQueue : public WDDMQueue { +public: + SDMAQueue(WDDMDevice *device, + void *ring, + uint64_t cmdbuf_size, + uint32_t engine, + bool use_hws = true); + + virtual ~SDMAQueue(); + + hsa_status_t Init(void); + hsa_status_t Fini(void); + hsa_status_t Submit(void); + + int PreparePacket(uint32_t offset, uint64_t size); + + void WaitQueue(void) { + device->CpuWait(&syncobj, 
&rptr_next, 1, false); + } + + uint64_t * GetRingWptr(void) { return &wptr_next_; } + uint64_t * GetRingRptr(void) { return WDDMQueue::GetSyncAddr(); } + uint64_t * GetDoorbellPtr() { return &doorbell_; } + void RingDoorbell(); + void* GetHsaQueueAddr(void) const { return reinterpret_cast(GetCmdbufAddr()); } + +private: + uint64_t wptr_next_; + uint64_t wptr_pre_; + uint64_t rptr_next; + uint64_t doorbell_; + std::vector> wptr_queue_; + uint64_t ib_size; + uint64_t ib_start_addr; + + std::thread thread_; + bool thread_stop_; + std::mutex thread_cond_lock_; + std::condition_variable thread_cond_; + static void SdmaThread(SDMAQueue *queue); + + struct SDMA_PKT_POLL_REGMEM { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 10; + unsigned int hdp_flush : 1; + unsigned int reserved_1 : 1; + unsigned int func : 3; + unsigned int mem_poll : 1; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int value : 32; + }; + unsigned int DW_3_DATA; + } VALUE_UNION; + + union { + struct { + unsigned int mask : 32; + }; + unsigned int DW_4_DATA; + } MASK_UNION; + + union { + struct { + unsigned int interval : 16; + unsigned int retry_count : 12; + unsigned int reserved_0 : 4; + }; + unsigned int DW_5_DATA; + } DW5_UNION; + }; + const unsigned int SDMA_OP_POLL_REGMEM = 8; + bool IsPollPacket(SDMA_PKT_POLL_REGMEM* pkt) { + return pkt->HEADER_UNION.op == SDMA_OP_POLL_REGMEM && + pkt->HEADER_UNION.mem_poll == 1 && + pkt->HEADER_UNION.func == 3; + } + uint32_t WrapIntoRocrRing(uint64_t idx) { return (idx & (cmdbuf_size - 1)); } +}; + +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h 
b/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h
new file mode 100644
index 0000000000..0efd9559fd
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/status.h
@@ -0,0 +1,61 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _WSL_INC_WDDM_STATUS_H
+#define _WSL_INC_WDDM_STATUS_H
+
+// Result codes returned by the WDDM thunk layer. The enum is declared at
+// global scope. Where a comment names an NTSTATUS value, it is the value that
+// TranslateNtStatus() in impl/wddm/thunks.h maps onto that enumerator.
+enum class ErrorCode {
+  Success,               // STATUS_SUCCESS
+  DeviceLost,            // STATUS_DEVICE_REMOVED
+  UnSupported,
+  NotReady,              // STATUS_PENDING
+  OutOfMemory,           // STATUS_NO_MEMORY
+  OutOfGpuMemory,        // STATUS_GRAPHICS_NO_VIDEO_MEMORY
+  OutOfHandleApeMemory,
+  Timeout,               // STATUS_TIMEOUT
+  SyscallFail,
+  InvalidateParams,      // STATUS_INVALID_PARAMETER
+  SameProcessSameDevice,
+  Unknown,               // any NTSTATUS without a dedicated mapping
+};
+
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h
new file mode 100644
index 0000000000..68f0015d6d
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/thunks.h
@@ -0,0 +1,233 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_THUNKS_H +#define _WSL_INC_WDDM_THUNKS_H + +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" +#include "dxcore_loader.h" + +namespace wsl { +namespace thunk { + +inline ErrorCode TranslateNtStatus(NTSTATUS status) { + switch (status) { + case STATUS_SUCCESS: + return ErrorCode::Success; + case STATUS_PENDING: + return ErrorCode::NotReady; + case STATUS_NO_MEMORY: + return ErrorCode::OutOfMemory; + case STATUS_DEVICE_REMOVED: + return ErrorCode::DeviceLost; + case STATUS_GRAPHICS_NO_VIDEO_MEMORY: + return ErrorCode::OutOfGpuMemory; + case STATUS_TIMEOUT: + return ErrorCode::Timeout; + case STATUS_INVALID_PARAMETER: + return ErrorCode::InvalidateParams; + default: + break; + } + return ErrorCode::Unknown; +} + +namespace d3dthunk { + +typedef D3DKMT_CREATEALLOCATION CreateAllocationArgs; +typedef D3DKMT_CREATECONTEXT CreateContextArgs; +typedef D3DKMT_CREATECONTEXTVIRTUAL CreateContextVirtualArgs; +typedef D3DKMT_CREATEPAGINGQUEUE 
CreatePagingQueueArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT CreateSynchronizationObjectArgs; +typedef D3DKMT_CREATESYNCHRONIZATIONOBJECT2 CreateSynchronizationObject2Args; +typedef D3DKMT_ESCAPE EscapeArgs; +typedef D3DKMT_EVICT EvictArgs; +typedef D3DKMT_FREEGPUVIRTUALADDRESS FreeGpuVirtualAddressArgs; +typedef D3DKMT_LOCK LockArgs; +typedef D3DKMT_LOCK2 Lock2Args; +typedef D3DKMT_OPENRESOURCE OpenResourceArgs; +typedef D3DKMT_OPENRESOURCEFROMNTHANDLE OpenResourceFromNtHandleArgs; +typedef D3DKMT_QUERYADAPTERINFO QueryAdapterInfoArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT SignalSynchronizationObjectArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECT2 SignalSynchronizationObject2Args; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMCPU SignalSynchronizationObjectFromCpuArgs; +typedef D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU2 SignalSynchronizationObjectFromGpuArgs; +typedef D3DKMT_SUBMITCOMMAND SubmitCommandArgs; +typedef D3DKMT_UNLOCK UnlockArgs; +typedef D3DKMT_UNLOCK2 Unlock2Args; +typedef D3DKMT_UPDATEGPUVIRTUALADDRESS UpdateGpuVirtualAddressArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT WaitForSynchronizationObjectArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECT2 WaitForSynchronizationObject2Args; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU WaitForSynchronizationObjectFromCpuArgs; +typedef D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU WaitForSynchronizationObjectFromGpuArgs; +typedef D3DKMT_ACQUIREKEYEDMUTEX AcquireKeyedMutexArgs; +typedef D3DKMT_RELEASEKEYEDMUTEX ReleaseKeyedMutexArgs; +typedef D3DKMT_OPENKEYEDMUTEX OpenKeyedMutexArgs; +typedef D3DKMT_DESTROYKEYEDMUTEX DestroyKeyedMutexArgs; +typedef D3DKMT_QUERYVIDEOMEMORYINFO QueryVideoMemoryInfoArgs; +typedef D3DKMT_CREATEHWQUEUE CreateHwQueueArgs; +typedef D3DKMT_DESTROYHWQUEUE DestroyHwQueueArgs; +typedef D3DKMT_SUBMITCOMMANDTOHWQUEUE SubmitCommandToHwQueueArgs; +typedef D3DKMT_SUBMITPRESENTTOHWQUEUE SubmitPresentToHwQueueArgs; +typedef D3DKMT_SUBMITSIGNALSYNCOBJECTSTOHWQUEUE 
SubmitSignalSyncObjectsToHwQueueArgs; +typedef D3DKMT_SUBMITWAITFORSYNCOBJECTSTOHWQUEUE SubmitWaitForSyncObjectsToHwQueueArgs; +typedef D3DKMT_CREATESYNCFILE CreateSyncFileArgs; + +inline ErrorCode MapGpuVirtualAddress(D3DDDI_MAPGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTMapGpuVirtualAddress(args))); +} + +inline ErrorCode CreateAllocation(CreateAllocationArgs *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTCreateAllocation2(args))); +} + +inline ErrorCode DestroyAllocation( + WinDeviceHandle device, + WinResourceHandle resource, + size_t num_allocations, + const WinAllocationHandle *alloc_handles) { + + D3DKMT_DESTROYALLOCATION2 args{}; + + memset(&args, 0, sizeof(args)); + args.hDevice = device; + if (resource) { + args.hResource = resource; + } else { + args.phAllocationList = alloc_handles; + args.AllocationCount = num_allocations; + } + + return TranslateNtStatus(DXCORE_CALL(D3DKMTDestroyAllocation2(&args))); +} + +inline ErrorCode ReserveGpuVirtualAddress(D3DDDI_RESERVEGPUVIRTUALADDRESS *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTReserveGpuVirtualAddress(args))); +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize base_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.BaseAddress = base_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; +} + +inline ErrorCode ReserveGpuVirtualAddress(WinAdapterHandle handle, + gpusize size, + gpusize minimum_address, + gpusize maximum_address, + gpusize *out_addr) { + D3DDDI_RESERVEGPUVIRTUALADDRESS args{}; + args.hPagingQueue = handle; + args.Size = size; + args.MinimumAddress = minimum_address; + args.MaximumAddress = maximum_address; + + auto code = ReserveGpuVirtualAddress(&args); + if (code == ErrorCode::Success) + *out_addr = args.VirtualAddress; + return code; 
+} + +inline ErrorCode FreeGpuVirtualAddress(FreeGpuVirtualAddressArgs *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTFreeGpuVirtualAddress(args))); +} + +inline ErrorCode FreeGpuVirtualAddress(WinAdapterHandle handle, + gpusize base_address, + gpusize size) { + FreeGpuVirtualAddressArgs args{}; + args.hAdapter = handle; + args.Size = size; + args.BaseAddress = base_address; + return FreeGpuVirtualAddress(&args); +} + +inline ErrorCode MakeResident(D3DDDI_MAKERESIDENT *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTMakeResident(args))); +} + +inline ErrorCode Evict(EvictArgs *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTEvict(args))); +} + +inline ErrorCode ShareObjects(size_t num_allocations, + WinResourceHandle resource, + uint32_t flags, + int* dmabuf_fd) { + OBJECT_ATTRIBUTES obj_attr; + HANDLE nt_handle; + ErrorCode ret; + + InitializeObjectAttributes(&obj_attr, nullptr, OBJ_INHERIT, nullptr, nullptr); + ret = TranslateNtStatus(DXCORE_CALL(D3DKMTShareObjects(num_allocations, + &resource, &obj_attr, flags, &nt_handle))); + if (ret == ErrorCode::Success) + *dmabuf_fd = *(reinterpret_cast(&nt_handle)); + else + *dmabuf_fd = -1; + + return ret; +} + +inline ErrorCode QueryResourceInfoFromNtHandle(D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTQueryResourceInfoFromNtHandle(args))); +} + +inline ErrorCode OpenResourceFromNtHandle(D3DKMT_OPENRESOURCEFROMNTHANDLE *args) { + return TranslateNtStatus(DXCORE_CALL(D3DKMTOpenResourceFromNtHandle(args))); +} + +} // namespace d3dthunk +} // namespace thunk +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h b/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h new file mode 100644 index 0000000000..0a3ca35ebc --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/types.h @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University 
of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _WSL_INC_WDDM_TYPES_H_ +#define _WSL_INC_WDDM_TYPES_H_ + +#include +#include +#include "impl/thunk_proxy/wddm_types.h" +// windows wchar is 16bit, but linux is 32bit +// seems libdxcore (not dxgkrnl.ko) convert thunk windows wchar to linux one +// so only accept 32bit wchar args. note driver private data structure still +// use 16bit wchar +#define WCHAR wchar_t +#define PCWSTR const wchar_t * +#include +#undef WCHAR +#undef PCWSTR + +using gpusize = uint64_t; // Used to specify GPU addresses and sizes of GPU allocations +using WinAllocationHandle = D3DKMT_HANDLE; +using WinResourceHandle = D3DKMT_HANDLE; +using WinContextHandle = D3DKMT_HANDLE; +using WinDeviceHandle = D3DKMT_HANDLE; +using WinAdapterHandle = D3DKMT_HANDLE; + +//reference dk/winnt.h +#define STANDARD_RIGHTS_REQUIRED (0x000F0000L) + +//reference dk/ntdef.h +#define OBJ_INHERIT (0x00000002L) +typedef WCHAR *PWCHAR, *LPWCH, *PWCH; +typedef struct _UNICODE_STRING { + USHORT Length; + USHORT MaximumLength; +#ifdef MIDL_PASS + [size_is(MaximumLength / 2), length_is((Length) / 2) ] USHORT * Buffer; +#else // MIDL_PASS + _Field_size_bytes_part_opt_(MaximumLength, Length) PWCH Buffer; +#endif // MIDL_PASS +} UNICODE_STRING; +typedef UNICODE_STRING *PUNICODE_STRING; +typedef const UNICODE_STRING *PCUNICODE_STRING; + +typedef struct _OBJECT_ATTRIBUTES { + ULONG Length; + HANDLE RootDirectory; + PUNICODE_STRING ObjectName; + ULONG Attributes; + PVOID SecurityDescriptor; + PVOID SecurityQualityOfService; +} OBJECT_ATTRIBUTES; +#define InitializeObjectAttributes( p, n, a, r, s ) { \ + (p)->Length = sizeof( OBJECT_ATTRIBUTES ); \ + (p)->RootDirectory = r; \ + (p)->Attributes = a; \ + (p)->ObjectName = n; \ + (p)->SecurityDescriptor = s; \ + (p)->SecurityQualityOfService = NULL; \ + } + +#endif \ No newline at end of file diff --git a/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h 
b/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h new file mode 100644 index 0000000000..675bfc3e39 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/include/impl/wddm/va_mgr.h @@ -0,0 +1,86 @@ +#ifndef _WSL_INC_WDDM_VA_MGR_H_ +#define _WSL_INC_WDDM_VA_MGR_H_ + +#include +#include +#include "util/utils.h" + +namespace wsl { +namespace thunk { + +class VaMgr { +public: + VaMgr(uint64_t start, uint64_t size, uint64_t min_align); + ~VaMgr(); + + /* Allocate `bytes` VA, if `align` is not zero, the returned address is aligned by `align`. + * If `addr` parameter is not zero, try best to allocate VA from fixed address `addr`. + */ + uint64_t Alloc(uint64_t bytes, uint64_t align, uint64_t addr = 0); + + void Free(uint64_t addr); + +private: + uint64_t AllocImpl(uint64_t bytes, uint64_t align); + + struct Fragment { + using ptr = std::multimap::iterator; + ptr free_list_entry_; + + struct { + uint64_t size : 63; + bool is_free : 1; + }; + + Fragment() : size(0), is_free(false) {} + Fragment(ptr iterator, uint64_t len, bool is_free) + : free_list_entry_(iterator), size(len), is_free(is_free) {} + }; + + static inline Fragment make_fragment(typename Fragment::ptr iter, uint64_t len) { + return {iter, len, true}; + } + + inline Fragment make_fragment(uint64_t len) { return {free_list_.end(), len, false}; } + + static inline bool is_free(const Fragment& f) { return f.is_free; } + void set_used(Fragment& f) { + f.is_free = false; + f.free_list_entry_ = free_list_.end(); + } + static void set_free(Fragment& f, typename Fragment::ptr iter) { + f.free_list_entry_ = iter; + f.is_free = true; + } + + inline void remove_free_list_entry(Fragment& frag) { + if (frag.free_list_entry_ != free_list_.end()) { + free_list_.erase(frag.free_list_entry_); + frag.free_list_entry_ = free_list_.end(); + } + } + + inline void add_free_fragment(uint64_t size, uint64_t base) { + auto it = free_list_.insert(std::make_pair(size, base)); + frag_map_[base] = make_fragment(it, size); + } + + 
inline void add_used_fragment(uint64_t size, uint64_t base) { + frag_map_[base] = make_fragment(size); + } + // Indexed by size + std::multimap free_list_; + // Indexed by VA, each fragment has no overlap + std::map frag_map_; + + uint64_t min_align_; + + std::mutex lock_; // Mutex protecting allocation and free of va + + + DISALLOW_COPY_AND_ASSIGN(VaMgr); +}; + +} // namespace thunk +} // namespace wsl +#endif diff --git a/projects/rocr-runtime/libhsakmt/librocdxg.pc.in b/projects/rocr-runtime/libhsakmt/librocdxg.pc.in new file mode 100755 index 0000000000..d9b362399d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/librocdxg.pc.in @@ -0,0 +1,11 @@ +prefix=${pcfiledir}/../../.. +exec_prefix=${prefix} +libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ + +Name: librocdxg +Description: HSA Kernel Mode Thunk library for WSL support +Version: @LIB_VERSION_STRING@ + +Libs: -L${libdir} -lrocdxg +Cflags: -I${includedir} diff --git a/projects/rocr-runtime/libhsakmt/rocdxg-config.cmake.in b/projects/rocr-runtime/libhsakmt/rocdxg-config.cmake.in new file mode 100644 index 0000000000..5f2ab41f37 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/rocdxg-config.cmake.in @@ -0,0 +1,14 @@ +@PACKAGE_INIT@ + +include( CMakeFindDependencyMacro ) + +# Locate dependent packages here. Finding them propagates usage requirements, +# if any, to our clients and ensures that their target names are in scope for +# the build. rocdxg has no cmake project dependencies so there is nothing to +# find. If we switch to use find_package with external (to ROCm) library +# dependencies (ie libnuma) then those packages should be located here using +# find_dependencies as shown below. 
+#find_dependency(Bar, 2.0) + +include( "${CMAKE_CURRENT_LIST_DIR}/@ROCDXG_TARGET@Targets.cmake" ) + diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp new file mode 100644 index 0000000000..e32c28b1d4 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/ais.cpp @@ -0,0 +1,39 @@ +/* + * Copyright © 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + + +HSAKMT_STATUS HSAKMTAPI hsaKmtAisReadWriteFile(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAint32 fd, + HSAint64 file_offset, + HsaAisFlags AisFlags, + HSAuint64 *SizeCopiedInBytes, + HSAint32 *status) +{ + CHECK_DXG_OPEN(); + + pr_warn_once("not implemented\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp new file mode 100644 index 0000000000..2b4425599a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/debug.cpp @@ -0,0 +1,126 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + + +static uint32_t runtime_capabilities_mask = 0; + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl( + HSAuint32 NodeId, HSA_DBG_WAVEOP Operand, HSA_DBG_WAVEMODE Mode, + HSAuint32 TrapId, HsaDbgWaveMessage *DbgWaveMsgRing) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch( + HSAuint32 NodeId, HSAuint32 NumWatchPoints, HSA_DBG_WATCH_MODE WatchMode[], + void *WatchAddress[], HSAuint64 WatchMask[], HsaEvent *WatchEvent[]) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug, bool setupTtmp) { + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + + if (result) + return result; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void) { + HSAKMT_STATUS result = hsaKmtCheckRuntimeDebugSupport(); + + if (result) + return HSAKMT_STATUS_SUCCESS; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask) { + CHECK_DXG_OPEN(); + *caps_mask = runtime_capabilities_mask; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info, + HSAuint32 *data_size) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void) { + 
CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data, + HSAuint32 *n_entries, + HSAuint32 *entry_size) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data, HSAuint32 *n_entries, + HSAuint32 *entry_size, + bool suspend_queues) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args, HSA_QUEUEID *Queues, + HSAuint64 *DebugReturn) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp new file mode 100644 index 0000000000..5d38d69c8d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ */ + +#include "dxcore_loader.h" +#include "librocdxg.h" +#include +#include +#include +#include + +namespace wsl { +namespace thunk { +namespace dxcore { + +DxcoreLoader::DxcoreLoader() + : dxcore_handle_(nullptr) + , init_flag_() + , pfn_D3DKMTCreateAllocation2(nullptr) + , pfn_D3DKMTDestroyAllocation2(nullptr) + , pfn_D3DKMTMapGpuVirtualAddress(nullptr) + , pfn_D3DKMTReserveGpuVirtualAddress(nullptr) + , pfn_D3DKMTFreeGpuVirtualAddress(nullptr) + , pfn_D3DKMTCreateDevice(nullptr) + , pfn_D3DKMTDestroyDevice(nullptr) + , pfn_D3DKMTEnumAdapters2(nullptr) + , pfn_D3DKMTQueryAdapterInfo(nullptr) + , pfn_D3DKMTCreateContextVirtual(nullptr) + , pfn_D3DKMTDestroyContext(nullptr) + , pfn_D3DKMTSubmitCommand(nullptr) + , pfn_D3DKMTCreateSynchronizationObject2(nullptr) + , pfn_D3DKMTDestroySynchronizationObject(nullptr) + , pfn_D3DKMTQueryStatistics(nullptr) + , pfn_D3DKMTEscape(nullptr) + , pfn_D3DKMTLock2(nullptr) + , pfn_D3DKMTUnlock2(nullptr) + , pfn_D3DKMTCreatePagingQueue(nullptr) + , pfn_D3DKMTDestroyPagingQueue(nullptr) + , pfn_D3DKMTWaitForSynchronizationObjectFromGpu(nullptr) + , pfn_D3DKMTSignalSynchronizationObjectFromGpu(nullptr) + , pfn_D3DKMTWaitForSynchronizationObjectFromCpu(nullptr) + , pfn_D3DKMTQueryClockCalibration(nullptr) + , pfn_D3DKMTMakeResident(nullptr) + , pfn_D3DKMTEvict(nullptr) + , pfn_D3DKMTShareObjects(nullptr) + , pfn_D3DKMTQueryResourceInfoFromNtHandle(nullptr) + , pfn_D3DKMTOpenResourceFromNtHandle(nullptr) + , pfn_D3DKMTCreateHwQueue(nullptr) + , pfn_D3DKMTDestroyHwQueue(nullptr) + , pfn_D3DKMTSubmitCommandToHwQueue(nullptr) { +} + +DxcoreLoader::~DxcoreLoader() { + Shutdown(); +} + +bool DxcoreLoader::Initialize() { + dlerror(); // Clear error + dxcore_handle_ = dlopen("libdxcore.so", RTLD_LAZY); + + if (!dxcore_handle_) { + pr_err("[DxcoreLoader] Cannot load libdxcore.so: %s\n", dlerror()); + return false; + } + + pr_info("[DxcoreLoader] libdxcore.so loaded successfully\n"); + if (!LoadDxcoreApis()) { + // If API loading failed, 
close the handle to indicate failure + dlclose(dxcore_handle_); + dxcore_handle_ = nullptr; + return false; + } + + return IsLoaded(); +} + +void DxcoreLoader::Shutdown() { + if (dxcore_handle_) { + if (dlclose(dxcore_handle_) != 0) { + pr_err("[DxcoreLoader] Cannot unload libdxcore.so: %s\n", dlerror()); + } else { + pr_info("[DxcoreLoader] libdxcore.so unloaded successfully\n"); + } + dxcore_handle_ = nullptr; + } +} + +bool DxcoreLoader::LoadDxcoreApis() { + if (!dxcore_handle_) { + pr_err("[DxcoreLoader] Error: dxcore_handle_ is null\n"); + return false; + } + + dlerror(); // Clear error + + // Load all D3DKMT functions + #define LOAD_DXCORE_API(func_name) \ + DXCORE_PFN(func_name) = (DXCORE_DEF(func_name)*)dlsym(dxcore_handle_, #func_name); \ + if (!DXCORE_PFN(func_name)) { \ + pr_err("[DxcoreLoader] Failed to load " #func_name ": %s\n", dlerror()); \ + goto ERROR; \ + } + + LOAD_DXCORE_API(D3DKMTCreateAllocation2); + LOAD_DXCORE_API(D3DKMTDestroyAllocation2); + LOAD_DXCORE_API(D3DKMTMapGpuVirtualAddress); + LOAD_DXCORE_API(D3DKMTReserveGpuVirtualAddress); + LOAD_DXCORE_API(D3DKMTFreeGpuVirtualAddress); + LOAD_DXCORE_API(D3DKMTCreateDevice); + LOAD_DXCORE_API(D3DKMTDestroyDevice); + LOAD_DXCORE_API(D3DKMTEnumAdapters2); + LOAD_DXCORE_API(D3DKMTQueryAdapterInfo); + LOAD_DXCORE_API(D3DKMTCreateContextVirtual); + LOAD_DXCORE_API(D3DKMTDestroyContext); + LOAD_DXCORE_API(D3DKMTSubmitCommand); + LOAD_DXCORE_API(D3DKMTCreateSynchronizationObject2); + LOAD_DXCORE_API(D3DKMTDestroySynchronizationObject); + LOAD_DXCORE_API(D3DKMTQueryStatistics); + LOAD_DXCORE_API(D3DKMTEscape); + LOAD_DXCORE_API(D3DKMTLock2); + LOAD_DXCORE_API(D3DKMTUnlock2); + LOAD_DXCORE_API(D3DKMTCreatePagingQueue); + LOAD_DXCORE_API(D3DKMTDestroyPagingQueue); + LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromGpu); + LOAD_DXCORE_API(D3DKMTSignalSynchronizationObjectFromGpu); + LOAD_DXCORE_API(D3DKMTWaitForSynchronizationObjectFromCpu); + LOAD_DXCORE_API(D3DKMTQueryClockCalibration); + 
LOAD_DXCORE_API(D3DKMTMakeResident); + LOAD_DXCORE_API(D3DKMTEvict); + LOAD_DXCORE_API(D3DKMTShareObjects); + LOAD_DXCORE_API(D3DKMTQueryResourceInfoFromNtHandle); + LOAD_DXCORE_API(D3DKMTOpenResourceFromNtHandle); + LOAD_DXCORE_API(D3DKMTCreateHwQueue); + LOAD_DXCORE_API(D3DKMTDestroyHwQueue); + LOAD_DXCORE_API(D3DKMTSubmitCommandToHwQueue); + + #undef LOAD_DXCORE_API + + pr_info("[DxcoreLoader] All DXCore APIs loaded successfully\n"); + return true; +ERROR: + pr_err("[DxcoreLoader] Failed to load DXCore APIs\n"); + return false; +} + +} // namespace dxcore +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h new file mode 100644 index 0000000000..3f649a4da0 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/dxcore_loader.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LIBROCDXG_DXCORE_LOADER_H +#define LIBROCDXG_DXCORE_LOADER_H + +#include "impl/wddm/types.h" +#include +#include + +#define DXCORE_CALL(function_name) wsl::thunk::dxcore::DxcoreLoader::Instance().pfn_##function_name + +namespace wsl { +namespace thunk { +namespace dxcore { + +/** + * @brief DxcoreLoader class for dynamic loading of libdxcore.so + * + * This class provides a singleton loader for the DXCore library, allowing + * optional loading based on environment variable LIBROCDXG_ENABLE_DXCORE. + * Supported values: "1", "true", "yes" (case-sensitive). + * If not set or invalid, fallback to stub implementations. + * + * Thread-safe initialization using std::call_once. + */ + +// Macro definitions mimicking HSAKMT design +#define DXCORE_DEF(function_name) PFN##function_name +#define DXCORE_PFN(function_name) pfn_##function_name + +class DxcoreLoader { +public: + // D3DKMT function type definitions + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateAllocation2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyAllocation2))(void *args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTMapGpuVirtualAddress))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTReserveGpuVirtualAddress))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTFreeGpuVirtualAddress))(void *args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateDevice))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyDevice))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTEnumAdapters2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryAdapterInfo))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateContextVirtual))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyContext))(void* args); + typedef NTSTATUS 
(DXCORE_DEF(D3DKMTSubmitCommand))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateSynchronizationObject2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroySynchronizationObject))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryStatistics))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTEscape))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTLock2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTUnlock2))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreatePagingQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyPagingQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryClockCalibration))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTMakeResident))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTEvict))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTShareObjects))(size_t num_allocations, WinResourceHandle* resource, OBJECT_ATTRIBUTES* obj_attr, uint32_t flags, void** nt_handle); + typedef NTSTATUS (DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTOpenResourceFromNtHandle))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTCreateHwQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTDestroyHwQueue))(void* args); + typedef NTSTATUS (DXCORE_DEF(D3DKMTSubmitCommandToHwQueue))(void* args); + + static DxcoreLoader& Instance() { + static DxcoreLoader* instance = new DxcoreLoader(); + return (*instance); + } + + bool Initialize(); + void Shutdown(); + bool IsLoaded() const { return dxcore_handle_ != nullptr; } + + // Function pointer declarations + DXCORE_DEF(D3DKMTCreateAllocation2)* DXCORE_PFN(D3DKMTCreateAllocation2); + DXCORE_DEF(D3DKMTDestroyAllocation2)* 
DXCORE_PFN(D3DKMTDestroyAllocation2); + DXCORE_DEF(D3DKMTMapGpuVirtualAddress)* DXCORE_PFN(D3DKMTMapGpuVirtualAddress); + DXCORE_DEF(D3DKMTReserveGpuVirtualAddress)* DXCORE_PFN(D3DKMTReserveGpuVirtualAddress); + DXCORE_DEF(D3DKMTFreeGpuVirtualAddress)* DXCORE_PFN(D3DKMTFreeGpuVirtualAddress); + DXCORE_DEF(D3DKMTCreateDevice)* DXCORE_PFN(D3DKMTCreateDevice); + DXCORE_DEF(D3DKMTDestroyDevice)* DXCORE_PFN(D3DKMTDestroyDevice); + DXCORE_DEF(D3DKMTEnumAdapters2)* DXCORE_PFN(D3DKMTEnumAdapters2); + DXCORE_DEF(D3DKMTQueryAdapterInfo)* DXCORE_PFN(D3DKMTQueryAdapterInfo); + DXCORE_DEF(D3DKMTCreateContextVirtual)* DXCORE_PFN(D3DKMTCreateContextVirtual); + DXCORE_DEF(D3DKMTDestroyContext)* DXCORE_PFN(D3DKMTDestroyContext); + DXCORE_DEF(D3DKMTSubmitCommand)* DXCORE_PFN(D3DKMTSubmitCommand); + DXCORE_DEF(D3DKMTCreateSynchronizationObject2)* DXCORE_PFN(D3DKMTCreateSynchronizationObject2); + DXCORE_DEF(D3DKMTDestroySynchronizationObject)* DXCORE_PFN(D3DKMTDestroySynchronizationObject); + DXCORE_DEF(D3DKMTQueryStatistics)* DXCORE_PFN(D3DKMTQueryStatistics); + DXCORE_DEF(D3DKMTEscape)* DXCORE_PFN(D3DKMTEscape); + DXCORE_DEF(D3DKMTLock2)* DXCORE_PFN(D3DKMTLock2); + DXCORE_DEF(D3DKMTUnlock2)* DXCORE_PFN(D3DKMTUnlock2); + DXCORE_DEF(D3DKMTCreatePagingQueue)* DXCORE_PFN(D3DKMTCreatePagingQueue); + DXCORE_DEF(D3DKMTDestroyPagingQueue)* DXCORE_PFN(D3DKMTDestroyPagingQueue); + DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromGpu); + DXCORE_DEF(D3DKMTSignalSynchronizationObjectFromGpu)* DXCORE_PFN(D3DKMTSignalSynchronizationObjectFromGpu); + DXCORE_DEF(D3DKMTWaitForSynchronizationObjectFromCpu)* DXCORE_PFN(D3DKMTWaitForSynchronizationObjectFromCpu); + DXCORE_DEF(D3DKMTQueryClockCalibration)* DXCORE_PFN(D3DKMTQueryClockCalibration); + DXCORE_DEF(D3DKMTMakeResident)* DXCORE_PFN(D3DKMTMakeResident); + DXCORE_DEF(D3DKMTEvict)* DXCORE_PFN(D3DKMTEvict); + DXCORE_DEF(D3DKMTShareObjects)* DXCORE_PFN(D3DKMTShareObjects); + 
DXCORE_DEF(D3DKMTQueryResourceInfoFromNtHandle)* DXCORE_PFN(D3DKMTQueryResourceInfoFromNtHandle); + DXCORE_DEF(D3DKMTOpenResourceFromNtHandle)* DXCORE_PFN(D3DKMTOpenResourceFromNtHandle); + DXCORE_DEF(D3DKMTCreateHwQueue)* DXCORE_PFN(D3DKMTCreateHwQueue); + DXCORE_DEF(D3DKMTDestroyHwQueue)* DXCORE_PFN(D3DKMTDestroyHwQueue); + DXCORE_DEF(D3DKMTSubmitCommandToHwQueue)* DXCORE_PFN(D3DKMTSubmitCommandToHwQueue); + +private: + DxcoreLoader(); + ~DxcoreLoader(); + + bool LoadDxcoreApis(); + + void* dxcore_handle_; + std::once_flag init_flag_; // For thread-safe initialization + + // Disable copy + DxcoreLoader(const DxcoreLoader&) = delete; + DxcoreLoader& operator=(const DxcoreLoader&) = delete; +}; + +} // namespace dxcore +} // namespace thunk +} // namespace wsl + +#endif // LIBROCDXG_DXCORE_LOADER_H diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp new file mode 100644 index 0000000000..1a360832de --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/events.cpp @@ -0,0 +1,127 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, + bool ManualReset, bool IsSignaled, + HsaEvent **Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + if (!Event) + return HSAKMT_STATUS_SUCCESS; + + pr_warn_once("not supported\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event, + HSAuint32 Milliseconds) { + return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event, + HSAuint32 Milliseconds, + uint64_t *event_age) { + if (!Event) + return HSAKMT_STATUS_INVALID_HANDLE; + + return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, + event_age); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + 
HSAuint32 Milliseconds) { + return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, + Milliseconds, NULL); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], + HSAuint32 NumEvents, + bool WaitOnAll, + HSAuint32 Milliseconds, + uint64_t *event_age) { + CHECK_DXG_OPEN(); + + if (!Events) + return HSAKMT_STATUS_INVALID_HANDLE; + + if (NumEvents == 1 && Events[0] == nullptr) { + std::this_thread::sleep_for(std::chrono::microseconds(20)); + return HSAKMT_STATUS_SUCCESS; + } + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd) { + CHECK_DXG_OPEN(); + pr_debug("node id %d\n", NodeId); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp new file mode 100755 index 0000000000..431e7bb91a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsa.cpp @@ -0,0 +1,137 @@ +#include +#include "impl/hsa/hsa.h" +#include "impl/hsa/hsa_ven_amd_loader.h" + +static std::mutex* lock_ = new std::mutex(); + +#if 1 +#define _HSAKMT_LOOKUP_SYMS(_sym) \ +if (fn_##_sym == nullptr) { \ + std::lock_guard gard(*lock_); \ + if (fn_##_sym == nullptr) { \ + fn_##_sym = \ + reinterpret_cast(dlsym(RTLD_DEFAULT, #_sym)); \ + if (!fn_##_sym) { \ + pr_err("%s not found - %s\n", #_sym, dlerror()); \ + } \ + } \ +} + +#define _HSAKMT_EXEC_API(_sym, ...) 
\ +do { \ + if (fn_##_sym != nullptr) { \ + return fn_##_sym(__VA_ARGS__); \ + } \ +} while(0); + +bool hsakmt_hsa_loader_init() { + void *hsa_loader_handle = dlopen("libhsa-runtime64.so", RTLD_NOW | RTLD_GLOBAL); + if (hsa_loader_handle == nullptr) { + pr_err("dlopen libhsa-runtime64.so failed - %s\n", dlerror()); + return false; + } + dlclose(hsa_loader_handle); + return true; +} + +hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) { + static hsa_signal_value_t (*fn_hsa_signal_load_relaxed)(hsa_signal_t signal) = nullptr; + + _HSAKMT_LOOKUP_SYMS(hsa_signal_load_relaxed); + _HSAKMT_EXEC_API(hsa_signal_load_relaxed, signal); + + return 0; +} + +hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) { +static hsa_signal_value_t (*fn_hsa_signal_wait_relaxed)( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) = nullptr; + + _HSAKMT_LOOKUP_SYMS(hsa_signal_wait_relaxed); + _HSAKMT_EXEC_API(hsa_signal_wait_relaxed, signal, condition, compare_value, + timeout_hint, wait_state_hint); + + return 0; +} + +void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, + hsa_signal_value_t value){ +static void (*fn_hsa_signal_store_screlease)(hsa_signal_t hsa_signal, + hsa_signal_value_t value) = nullptr; + + _HSAKMT_LOOKUP_SYMS(hsa_signal_store_screlease); + _HSAKMT_EXEC_API(hsa_signal_store_screlease, hsa_signal, value); +} + +hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address( + const void *device_address, const void **host_address) { + static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)( + const void *device_address, const void **host_address) = nullptr; + + if (fn_hsa_ven_amd_loader_query_host_address == nullptr) { + std::lock_guard gard(*lock_); + if 
(fn_hsa_ven_amd_loader_query_host_address == nullptr) { + hsa_status_t (*fn_hsa_system_get_extension_table)( + uint16_t extension, uint16_t version_major, uint16_t version_minor, void *table); + fn_hsa_system_get_extension_table = + reinterpret_cast(dlsym(RTLD_DEFAULT, "hsa_system_get_extension_table")); + if (fn_hsa_system_get_extension_table == nullptr) { + pr_err("%s not found - %s\n", "hsa_system_get_extension_table", dlerror()); + return HSA_STATUS_ERROR; + } + + hsa_ven_amd_loader_1_03_pfn_t table; + fn_hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table); + fn_hsa_ven_amd_loader_query_host_address = + table.hsa_ven_amd_loader_query_host_address; + } + } + + _HSAKMT_EXEC_API(hsa_ven_amd_loader_query_host_address, device_address, host_address); + return HSA_STATUS_ERROR; +} + +#else +hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal) { + return hsa_signal_load_relaxed(signal); +} + +hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint) { + return hsa_signal_wait_relaxed(signal, condition, compare_value, timeout_hint, + wait_state_hint); +} + +void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, + hsa_signal_value_t value) { + hsa_signal_store_screlease(hsa_signal, value); +} + +hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address( + const void *device_address, const void **host_address) { + static hsa_status_t (*fn_hsa_ven_amd_loader_query_host_address)( + const void *device_address, const void **host_address) = nullptr; + + if (fn_hsa_ven_amd_loader_query_host_address == nullptr) { + std::lock_guard gard(*lock_); + if (fn_hsa_ven_amd_loader_query_host_address == nullptr) { + hsa_ven_amd_loader_1_03_pfn_t table; + hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 3, &table); + fn_hsa_ven_amd_loader_query_host_address = + 
table.hsa_ven_amd_loader_query_host_address; + } + } + + if (fn_hsa_ven_amd_loader_query_host_address) + return fn_hsa_ven_amd_loader_query_host_address(device_address, host_address); + + return HSA_STATUS_ERROR; +} +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp new file mode 100644 index 0000000000..6799f5d891 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/hsakmtmodel.cpp @@ -0,0 +1,31 @@ +/* +* Copyright © 2025 Advanced Micro Devices, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including +* the next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*/ + +HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable) +{ + *enable = false; + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp new file mode 100644 index 0000000000..2e125dfb3e --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/libdrm.cpp @@ -0,0 +1,182 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// +#include + +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle( + HSAuint32 NodeId, HsaAMDGPUDeviceHandle *DeviceHandle) { + CHECK_DXG_OPEN(); + + wsl::thunk::WDDMDevice *pDevice = get_wddmdev(NodeId); + if (pDevice != nullptr) { + *DeviceHandle = reinterpret_cast(pDevice); + return HSAKMT_STATUS_SUCCESS; + } + return HSAKMT_STATUS_ERROR; +} + +HSAKMTAPI int amdgpu_device_initialize(int fd, + uint32_t *major_version, + uint32_t *minor_version, + amdgpu_device_handle *device_handle) { + return 0; +} + +HSAKMTAPI int amdgpu_device_deinitialize(amdgpu_device_handle device_handle) { + return 0; +} + +HSAKMTAPI int amdgpu_query_gpu_info(amdgpu_device_handle dev, + struct amdgpu_gpu_info *info) { + wsl::thunk::WDDMDevice *pDevice = + reinterpret_cast(dev); + memset(info, 0, sizeof(*info)); + info->gpu_counter_freq = pDevice->GPUCounterFrequency() / 1000ull; + return 0; +} + +HSAKMTAPI int amdgpu_device_get_fd(amdgpu_device_handle dev) { + return dxg_runtime->dxg_fd; +} + +HSAKMTAPI int amdgpu_bo_cpu_map(amdgpu_bo_handle bo, void **cpu) { + wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast(bo); + if (gpu_mem->IsSysMemFd()) + *cpu = gpu_mem->CpuAddress(); + return 0; +} + +HSAKMTAPI int amdgpu_bo_free(amdgpu_bo_handle buf_handle) { + wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast(buf_handle); + void *MemoryAddress = 
gpu_mem->IsVaAllocated() ? (void*)gpu_mem->GpuAddress() : (void*)gpu_mem->HandleApeAddress(); + auto ret = hsaKmtFreeMemory((void*)MemoryAddress, gpu_mem->Size()); + return ret == HSAKMT_STATUS_SUCCESS ? 0 : -1; +} + +HSAKMTAPI int amdgpu_bo_export(amdgpu_bo_handle bo, + enum amdgpu_bo_handle_type type, + uint32_t *shared_handle) { + *shared_handle = 0; + return 0; +} + +HSAKMTAPI int amdgpu_bo_import(amdgpu_device_handle dev, + enum amdgpu_bo_handle_type type, + uint32_t shared_handle, + struct amdgpu_bo_import_result *output) { + if (type != amdgpu_bo_handle_type_dma_buf_fd) { + pr_err("not implemented\n"); + return -1; + } + + + wsl::thunk::WDDMDevice *pDevice = reinterpret_cast(dev); + wsl::thunk::GpuMemoryHandle mem_handle; + bool is_ipc_memfd = is_ipc_sysmemfd(shared_handle); + bool alloc_va = is_ipc_memfd; + + HSAKMT_STATUS ret = import_dmabuf_fd(shared_handle, pDevice->NodeId(), + alloc_va, is_ipc_memfd, &mem_handle); + if (ret == HSAKMT_STATUS_SUCCESS) { + //use GpuMemory object handle as drm buf handle + output->buf_handle = reinterpret_cast(mem_handle); + return 0; + } else { + return -1; + } +} + +HSAKMTAPI int amdgpu_bo_va_op(amdgpu_bo_handle bo, + uint64_t offset, + uint64_t size, + uint64_t addr, + uint64_t flags, + uint32_t ops) { + wsl::thunk::GpuMemory *gpu_mem = reinterpret_cast(bo); + assert(gpu_mem != nullptr); + + switch(ops) { + case AMDGPU_VA_OP_MAP: + { + if (gpu_mem->GpuAddress() == addr) { + pr_info("bo is mapped already\n"); + return 0; + } else if (gpu_mem->GpuAddress()) { + pr_err("amdgpu_bo_va_op: GPU memory already mapped at %p, but requested to map at %p\n", + reinterpret_cast(gpu_mem->GpuAddress()), reinterpret_cast(addr)); + return -1; + } + auto code = gpu_mem->MapGpuVirtualAddress(reinterpret_cast(addr), size, offset); + if (code != ErrorCode::Success) + return -1; + + code = gpu_mem->MakeResident(); + if (code != ErrorCode::Success) + return -1; + } + break; + case AMDGPU_VA_OP_UNMAP: + { + auto code = 
gpu_mem->UnmapGpuVirtualAddress(reinterpret_cast(addr), size, offset); + if (code != ErrorCode::Success) + return -1; + gpu_mem->Evict(); + } + break; + } + return 0; +} + +HSAKMTAPI int amdgpu_bo_query_info(amdgpu_bo_handle bo, struct amdgpu_bo_info* info) { + return 0; +} + +HSAKMTAPI int amdgpu_bo_set_metadata(amdgpu_bo_handle bo, struct amdgpu_bo_metadata* info) { + return 0; +} + +HSAKMTAPI int drmCommandWriteRead(int fd, unsigned long drmCommandIndex, + void *data, unsigned long size) { + return 0; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h new file mode 100644 index 0000000000..02826b22b0 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.h @@ -0,0 +1,289 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIBHSAKMT_H_INCLUDED +#define LIBHSAKMT_H_INCLUDED + +#include +#include +#include +#include "hsakmt/hsakmt.h" +#include "hsakmt/hsakmt_drm.h" + +#include "impl/wddm/va_mgr.h" +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "dxcore_loader.h" + +wsl::thunk::WDDMDevice* get_wddmdev(uint32_t node_id); +uint32_t get_num_wddmdev(); +wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress); + +#define HSAKMT_DEBUG_LEVEL_ERR -1 +#define HSAKMT_DEBUG_LEVEL_DEFAULT 3 +#define HSAKMT_DEBUG_LEVEL_WARNING 4 +#define HSAKMT_DEBUG_LEVEL_INFO 6 +#define HSAKMT_DEBUG_LEVEL_DEBUG 7 + +struct hsakmtRuntime { + hsakmtRuntime() + : dxg_fd(-1), + parent_pid(getpid()), + is_forked(false), + hsakmt_debug_level(HSAKMT_DEBUG_LEVEL_DEFAULT), + dxg_open_count(0), + hsakmt_mutex(PTHREAD_MUTEX_INITIALIZER), + hsakmt_is_dgpu(false), + is_svm_api_supported(false), + zfb_support(0), + vendor_packet_process(0), + check_avail_sysram(false), + max_single_alloc_size(0), + enable_thunk_sub_allocator(0), + local_heap_space_start_(0), + local_heap_space_size_(0), + system_heap_space_start_(0), + system_heap_space_size_(0), + handle_aperture_start_(0), + handle_aperture_size_(0), + default_node(1) {} + + void HeapInit(); + void HeapFini(); + bool ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align); + bool FreeSvmSpace(uint64_t &base, uint64_t &size); + bool ReserveLocalHeapSpace(); + bool FreeLocalHeapSpace(); + void InitLocalHeapMgr(); + bool ReserveSystemHeapSpace(); + uint64_t SystemHeapSize() { return system_heap_space_size_; } + bool FreeSystemHeapSpace(); + bool CommitSystemHeapSpace(void* addr, int64_t size, bool lock); + bool DecommitSystemHeapSpace(void* addr, int64_t size); + void InitSystemHeapMgr(); + ErrorCode ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain, + gpusize hit_base_addr, gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, bool lock); + ErrorCode FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain, 
+ gpusize gpu_addr, gpusize size); + bool CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &fd, bool lock=false); + bool DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd); + ErrorCode ReserveIPCSysMem(gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, + int &memfd, bool lock); + ErrorCode FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd); + bool InitHandleApertureSpace(); + void InitHandleApertureMgr(); + ErrorCode HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr); + void HandleApertureFree(gpusize gpu_addr); + + pthread_mutex_t hsakmt_mutex; + const char *dxg_device_name = "/dev/dxg"; + long page_size; + int page_shift; + int dxg_fd = -1; + pid_t parent_pid = -1; + bool is_forked = false; + int hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT; + unsigned long dxg_open_count; + bool hsakmt_is_dgpu; + bool is_svm_api_supported; + int zfb_support; + int vendor_packet_process; + bool check_avail_sysram; + size_t max_single_alloc_size; + int enable_thunk_sub_allocator; + uint32_t default_node; + + /* local heap means bo's backend is vram of all GPUs */ + uint64_t local_heap_space_start_; + uint64_t local_heap_space_size_; + + /* manage the reserved local heap space which shared by CPU and GPUs */ + std::unique_ptr local_heap_mgr_; + + /* system heap means bo's backend is system ram */ + uint64_t system_heap_space_start_; + uint64_t system_heap_space_size_; + + /* manage the reserved system heap space which shared by CPU and GPUs */ + std::unique_ptr system_heap_mgr_; + + uint64_t handle_aperture_start_; + uint64_t handle_aperture_size_; + std::unique_ptr handle_aperture_mgr_; +}; + +extern hsakmtRuntime *dxg_runtime; + +#undef HSAKMTAPI +#define HSAKMTAPI __attribute__((visibility ("default"))) + +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define SANITIZER_AMDGPU 1 +#endif +#endif + +/*Avoid pointer-to-int-cast warning*/ +#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr)) + 
+/* Avoid int-to-pointer-cast warning: the value is narrowed to unsigned long
+ * before being cast to void*. */
+#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
+/* Early-return guard: leaves the enclosing function with
+ * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the dxg device has not been
+ * opened or the process is a forked child. */
+#define CHECK_DXG_OPEN() \
+  do { if (dxg_runtime->dxg_open_count == 0 || dxg_runtime->is_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
+
+/* 64KB BigK fragment size for TLB efficiency */
+#define GPU_BIGK_PAGE_SIZE (1 << 16)
+
+/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */
+#define GPU_HUGE_PAGE_SIZE (2 << 20)
+/* Early-return guard: leaves the enclosing function with
+ * HSAKMT_STATUS_INVALID_PARAMETER when x is not a multiple of
+ * dxg_runtime->page_size. */
+#define CHECK_PAGE_MULTIPLE(x) \
+  do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % dxg_runtime->page_size) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
+/* Alignment helpers; the mask arithmetic assumes `align` is a power of two. */
+#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
+#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
+#define PAGE_ALIGN_UP(x) ALIGN_UP(x,dxg_runtime->page_size)
+#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
+#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))
+
+/* HSA Thunk logging usage
+ *
+ * hsakmt_print_common() prefixes every message with the pid, the hex thread
+ * id and the calling function name, then flushes the stream.
+ * pr_err() always prints, to stderr.
+ * pr_warn()/pr_info()/pr_debug() go through hsakmt_print(), which is an empty
+ * statement under NDEBUG and otherwise prints to stdout when the message
+ * level is <= dxg_runtime->hsakmt_debug_level.
+ * The *_once variants keep a static flag per expansion site and print only
+ * the first time that site runs; the flag is a plain bool with no locking.
+ */
+#define get_thread_id() \
+  ([]() -> std::string { \
+    std::stringstream str_thrd_id; \
+    str_thrd_id << std::hex << std::this_thread::get_id(); \
+    return str_thrd_id.str(); \
+  })()
+#define hsakmt_print_common(stream, fmt, ...) \
+  do { \
+    fprintf(stream, "pid:%d tid:0x%s [%s] " fmt, getpid(), get_thread_id().c_str(), __FUNCTION__, ##__VA_ARGS__); \
+    fflush(stream); \
+  } while (false)
+#ifdef NDEBUG
+#define hsakmt_print(level, fmt, ...) \
+  do { } while (false)
+#else
+#define hsakmt_print(level, fmt, ...) \
+  do { \
+    if (level <= dxg_runtime->hsakmt_debug_level) { \
+      hsakmt_print_common(stdout, fmt, ##__VA_ARGS__); \
+    } \
+  } while (false)
+#endif
+
+#define pr_err(fmt, ...) \
+  hsakmt_print_common(stderr, fmt, ##__VA_ARGS__)
+#define pr_warn(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...) \
+  hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
+#define pr_err_once(fmt, ...) \
+({ \
+  static bool __print_once; \
+  if (!__print_once) { \
+    __print_once = true; \
+    pr_err(fmt, ##__VA_ARGS__); \
+  } \
+})
+#define pr_warn_once(fmt, ...) \
+({ \
+  static bool __print_once; \
+  if (!__print_once) { \
+    __print_once = true; \
+    pr_warn(fmt, ##__VA_ARGS__); \
+  } \
+})
+
+/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex:
+ * Major in bits 16 and up, Minor in bits 8-15, Stepping in bits 0-7. */
+#define HSA_GET_GFX_VERSION_FULL(ui32) \
+  (((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping))
+
+HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
+HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
+bool prefer_ats(HSAuint32 node_id);
+uint16_t get_device_id_by_node_id(HSAuint32 node_id);
+uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id);
+uint32_t get_direct_link_cpu(uint32_t gpu_node);
+
+HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props);
+HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId,
+                                      HsaNodeProperties *NodeProperties);
+HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId,
+                                        HSAuint32 NumIoLinks,
+                                        HsaIoLinkProperties *IoLinkProperties);
+void topology_setup_is_dgpu_param(HsaNodeProperties *props);
+
+HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags);
+
+uint32_t get_num_sysfs_nodes(void);
+
+bool is_forked_child(void);
+
+void clear_allocation_map(void);
+
+class BlockAllocator {
+private:
+  static const size_t block_size_ = 128 * 1024 * 1024; // 128MB blocks.
+
+public:
+  void* alloc(size_t request_size, size_t& allocated_size) const;
+  void free(void* ptr, size_t length) const;
+  size_t block_size() const { return block_size_; }
+};
+
+void reset_suballocator(void);
+void trim_suballocator(void);
+
+HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode,
+                                             HSAuint64 SizeInBytes,
+                                             HSAuint64 Alignment,
+                                             HsaMemFlags MemFlags,
+                                             void **MemoryAddress,
+                                             bool SkipSubAlloc = false);
+
+HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress,
+                                       HSAuint64 SizeInBytes,
+                                       bool SkipSubAlloc = false);
+
+bool queue_acquire_buffer(void *MemoryAddress);
+bool queue_release_buffer(void *MemoryAddress);
+/* Calculate VGPR and SGPR register file size per CU */
+uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id);
+#define SGPR_SIZE_PER_CU 0x4000
+
+bool is_ipc_sysmemfd(int fd);
+
+HSAKMT_STATUS import_dmabuf_fd(int DMABufFd,
+                               uint32_t NodeId,
+                               bool alloc_va,
+                               bool is_ipc_memfd,
+                               wsl::thunk::GpuMemoryHandle *GpuMemHandle);
+
+bool hsakmt_hsa_loader_init();
+#endif
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver
new file mode 100644
index 0000000000..d91b29ec90
--- /dev/null
+++ b/projects/rocr-runtime/libhsakmt/src/dxg/librocdxg.ver
@@ -0,0 +1,113 @@
+HSAKMT_1
+{
+global:
+hsaKmtOpenKFD;
+hsaKmtCloseKFD;
+hsaKmtGetVersion;
+hsaKmtAcquireSystemProperties;
+hsaKmtReleaseSystemProperties;
+hsaKmtGetNodeProperties;
+hsaKmtGetNodeMemoryProperties;
+hsaKmtGetNodeCacheProperties;
+hsaKmtGetNodeIoLinkProperties;
+hsaKmtCreateEvent;
+hsaKmtDestroyEvent;
+hsaKmtSetEvent;
+hsaKmtResetEvent;
+hsaKmtQueryEventState;
+hsaKmtWaitOnEvent;
+hsaKmtWaitOnMultipleEvents;
+hsaKmtCreateQueue;
+hsaKmtCreateQueueExt;
+hsaKmtUpdateQueue;
+hsaKmtDestroyQueue;
+hsaKmtSetQueueCUMask;
+hsaKmtSetMemoryPolicy;
+hsaKmtAllocMemory;
+hsaKmtAllocMemoryAlign;
+hsaKmtFreeMemory;
+hsaKmtAvailableMemory;
+hsaKmtRegisterMemory;
+hsaKmtRegisterMemoryToNodes;
+hsaKmtRegisterMemoryWithFlags;
+hsaKmtRegisterGraphicsHandleToNodes;
+hsaKmtRegisterGraphicsHandleToNodesExt;
+hsaKmtShareMemory;
+hsaKmtRegisterSharedHandle;
+hsaKmtRegisterSharedHandleToNodes;
+hsaKmtProcessVMRead;
+hsaKmtProcessVMWrite;
+hsaKmtDeregisterMemory;
+hsaKmtMapMemoryToGPU;
+hsaKmtMapMemoryToGPUNodes;
+hsaKmtUnmapMemoryToGPU;
+hsaKmtDbgRegister;
+hsaKmtDbgUnregister;
+hsaKmtDbgWavefrontControl;
+hsaKmtDbgAddressWatch;
+hsaKmtDbgEnable;
+hsaKmtDbgDisable;
+hsaKmtDbgGetDeviceData;
+hsaKmtDbgGetQueueData;
+hsaKmtGetClockCounters;
+hsaKmtPmcGetCounterProperties;
+hsaKmtPmcRegisterTrace;
+hsaKmtPmcUnregisterTrace;
+hsaKmtPmcAcquireTraceAccess;
+hsaKmtPmcReleaseTraceAccess;
+hsaKmtPmcStartTrace;
+hsaKmtPmcQueryTrace;
+hsaKmtPmcStopTrace;
+hsaKmtMapGraphicHandle;
+hsaKmtUnmapGraphicHandle;
+hsaKmtSetTrapHandler;
+hsaKmtGetTileConfig;
+hsaKmtQueryPointerInfo;
+hsaKmtSetMemoryUserData;
+hsaKmtGetQueueInfo;
+hsaKmtAllocQueueGWS;
+hsaKmtRuntimeEnable;
+hsaKmtRuntimeDisable;
+hsaKmtCheckRuntimeDebugSupport;
+hsaKmtGetRuntimeCapabilities;
+hsaKmtDebugTrapIoctl;
+hsaKmtSPMAcquire;
+hsaKmtSPMRelease;
+hsaKmtSPMSetDestBuffer;
+hsaKmtSVMSetAttr;
+hsaKmtSVMGetAttr;
+hsaKmtSetXNACKMode;
+hsaKmtGetXNACKMode;
+hsaKmtOpenSMI;
+hsaKmtExportDMABufHandle;
+hsaKmtGetMemoryHandle;
+hsaKmtWaitOnEvent_Ext;
+hsaKmtWaitOnMultipleEvents_Ext;
+hsaKmtReplaceAsanHeaderPage;
+hsaKmtReturnAsanHeaderPage;
+hsaKmtGetAMDGPUDeviceHandle;
+hsaKmtPcSamplingQueryCapabilities;
+hsaKmtPcSamplingCreate;
+hsaKmtPcSamplingDestroy;
+hsaKmtPcSamplingStart;
+hsaKmtPcSamplingStop;
+hsaKmtPcSamplingSupport;
+hsaKmtAisReadWriteFile;
+hsaKmtModelEnabled;
+hsaKmtQueueRingDoorbell;
+amdgpu_device_initialize;
+amdgpu_device_deinitialize;
+amdgpu_query_gpu_info;
+amdgpu_bo_import;
+amdgpu_bo_va_op;
+amdgpu_device_get_fd;
+amdgpu_bo_cpu_map;
+amdgpu_bo_free;
+amdgpu_bo_export;
+amdgpu_bo_query_info;
+amdgpu_bo_set_metadata;
+drmCommandWriteRead;
+
+local: *;
+};
+
+
diff --git
a/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp new file mode 100644 index 0000000000..b6ef48cf29 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/memory.cpp @@ -0,0 +1,989 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "impl/wddm/gpu_memory.h" +#include "util/simple_heap.h" + +struct Allocation { + Allocation() + : handle(0), cpu_addr(0), gpu_addr(0), size(0), userptr(false), + user_data(nullptr), size_requested(0), node_id(0), mem_flags_value(0), + dmabuf_fd(-1), rocr_userdata(nullptr) {} + Allocation(wsl::thunk::GpuMemoryHandle handle_arg, void *cpu_addr_arg, + uint64_t gpu_addr_arg, size_t size_arg, bool userptr_arg = false, + void *user_data_arg = nullptr, size_t user_size_arg = 0, + HSAuint32 node_id_arg = 0, HSAuint32 mem_flags_value_arg = 0) + : handle(handle_arg), cpu_addr(cpu_addr_arg), gpu_addr(gpu_addr_arg), + size(size_arg), userptr(userptr_arg), user_data(user_data_arg), + size_requested(user_size_arg), node_id(node_id_arg), + mem_flags_value(mem_flags_value_arg), dmabuf_fd(-1), rocr_userdata(nullptr) {} + + wsl::thunk::GpuMemoryHandle handle; + void *cpu_addr; + uint64_t gpu_addr; + bool userptr; + size_t size; /* actual size = align_up(size_requested, granularity) */ + void *user_data; + size_t size_requested; /* size requested by user */ + HSAuint32 node_id; + HSAuint32 mem_flags_value; + int dmabuf_fd; + void *rocr_userdata; +}; + +static std::map* allocation_map_ = new std::map(); +static std::mutex* allocation_map_lock_ = new std::mutex(); + +void clear_allocation_map(void) +{ + //delete allocation_map_lock_; + allocation_map_lock_ = new std::mutex(); + std::lock_guard lock(*allocation_map_lock_); + delete allocation_map_; + allocation_map_ = new std::map(); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node, + HSAuint32 DefaultPolicy, + HSAuint32 AlternatePolicy, + void *MemoryAddressAlternate, + HSAuint64 MemorySizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags) { + switch (pageSizeFlags) { + case 
HSA_PAGE_SIZE_4KB: + return 4 * 1024; + case HSA_PAGE_SIZE_64KB: + return 64 * 1024; + case HSA_PAGE_SIZE_2MB: + return 2 * 1024 * 1024; + case HSA_PAGE_SIZE_1GB: + return 1024 * 1024 * 1024; + default: + assert(false); + return 4 * 1024; + } +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HsaMemFlags MemFlags, + void **MemoryAddress) { + return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, 0, MemFlags, + MemoryAddress); +} + +#define POWER_OF_2(x) ((x && (!(x & (x - 1)))) ? 1 : 0) + +bool isSystemMemoryAvailable(HSAuint64 SizeInBytes) { + struct sysinfo info; + if (sysinfo(&info) != 0) + return false; + return SizeInBytes <= info.freeram; +} + +void* BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { + void *address; + HsaMemFlags MemFlags; + + MemFlags.Value = 0; + MemFlags.ui32.CoarseGrain = 1; + MemFlags.ui32.NoSubstitute = 1; + allocated_size = wsl::AlignUp(request_size, block_size()); + if (HSAKMT_STATUS_SUCCESS == hsaKmtAllocMemoryAlignInternal(1, allocated_size, 0, MemFlags, &address, true)) + return address; + + return nullptr; +} + +void BlockAllocator::free(void* ptr, size_t length) const { + if (HSAKMT_STATUS_SUCCESS != hsaKmtFreeMemoryInternal(ptr, length, true)) + pr_err("wsl-thunk: BlockAllocator::free() err, address %p, length:%zu\n", ptr, length); +} + +static wsl::SimpleHeap fragment_allocator_; + +void reset_suballocator(void) { + fragment_allocator_.reset(); +} + +void trim_suballocator(void) { + fragment_allocator_.trim(); +} + +HSAKMT_STATUS hsaKmtAllocMemoryAlignInternal(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress, + bool SkipSubAlloc) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (MemFlags.ui32.FixedAddress) { + if (*MemoryAddress == nullptr) + return HSAKMT_STATUS_INVALID_PARAMETER; + } else + *MemoryAddress = nullptr; + + uint32_t node = 
(PreferredNode == 0) ? dxg_runtime->default_node : PreferredNode; + wsl::thunk::WDDMDevice *dev = get_wddmdev(node); + if (!dev) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + wsl::thunk::GpuMemoryCreateInfo create_info{}; + create_info.size = SizeInBytes; + + /* If initialize scratch pool of GpuAgent, treat it as SVM reserve */ + if (MemFlags.ui32.Scratch && MemFlags.ui32.HostAccess && SizeInBytes > 0x80000000) + MemFlags.ui32.OnlyAddress = 1; + + create_info.alignment = Alignment; + create_info.va_hint = reinterpret_cast(*MemoryAddress); + if ((PreferredNode == 0 && MemFlags.ui32.HostAccess) + || dxg_runtime->zfb_support || MemFlags.ui32.GTTAccess) { + if (SizeInBytes > dxg_runtime->max_single_alloc_size) + return HSAKMT_STATUS_NO_MEMORY; + + if (dxg_runtime->check_avail_sysram && !isSystemMemoryAvailable(SizeInBytes)) + return HSAKMT_STATUS_NO_MEMORY; + + /* If allocate VRAM under ZFB mode */ + if (dxg_runtime->zfb_support && MemFlags.ui32.NonPaged == 1) + MemFlags.ui32.CoarseGrain = 1; + + // AllocateNonPaged == AllocateIPC + create_info.flags.sysmem_ipc_sig_exporter = !!(MemFlags.ui32.NonPaged && !MemFlags.ui32.GTTAccess); + + create_info.domain = thunk_proxy::AllocDomain::kSystem; + } else { + create_info.domain = thunk_proxy::AllocDomain::kLocal; + } + + if (!MemFlags.ui32.CoarseGrain) + create_info.mem_flags = thunk_proxy::kFineGrain; + + //In hsa-runtime, only kernarg region set Uncached. 
+ if (MemFlags.ui32.Uncached) + create_info.mem_flags |= thunk_proxy::kKernarg; + + create_info.flags.physical_only = MemFlags.ui32.NoAddress; + create_info.flags.alloc_va = !create_info.flags.physical_only; + create_info.flags.interprocess = MemFlags.ui32.NoAddress; + create_info.flags.interprocess |= MemFlags.ui32.Contiguous; + create_info.flags.physical_contiguous = MemFlags.ui32.Contiguous; + create_info.flags.locked = MemFlags.ui32.NoSubstitute;//AllocatePinned + create_info.flags.virtual_alloc = MemFlags.ui32.OnlyAddress; + create_info.flags.blit_kernel_object = + (MemFlags.ui32.ExecuteBlit && MemFlags.ui32.ExecuteAccess && + (create_info.domain == thunk_proxy::AllocDomain::kSystem)); + /*when only alloc virtual or only physical, it's vmm allocation, force to local*/ + if (create_info.flags.virtual_alloc || create_info.flags.physical_only + || create_info.flags.physical_contiguous) { + create_info.domain = thunk_proxy::AllocDomain::kLocal; + SkipSubAlloc = true; + } + + /* Only allow using the suballocator for ordinary VRAM.*/ + bool trim_safe = false; + if (!SkipSubAlloc && create_info.domain == thunk_proxy::AllocDomain::kLocal) { + /* just quickly skip SA if size is bigger than SA block size.*/ + gpusize real_size; + if (create_info.size > GPU_HUGE_PAGE_SIZE) + real_size = wsl::AlignUp(create_info.size, GPU_HUGE_PAGE_SIZE); + else + real_size = wsl::AlignUp(create_info.size, getpagesize()); + + if (real_size < fragment_allocator_.default_block_size()) { + *MemoryAddress = fragment_allocator_.alloc(real_size); + if (*MemoryAddress) + return HSAKMT_STATUS_SUCCESS; + } + + /* SA might keep a lot of free blocks as *cache*. + * We can trim them if direct allocation fails at first time. 
+ */ + trim_safe = true; + } + +after_trim: + auto code = dev->CreateGpuMemory(create_info, &gpu_mem); + if (code == ErrorCode::Success) { + std::lock_guard gard(*allocation_map_lock_); + + /* For these physical allcations, use GpuMemory object's address as thunk handle*/ + if (create_info.flags.physical_only || create_info.dmabuf_fd > 0) + *MemoryAddress = reinterpret_cast(gpu_mem->HandleApeAddress()); + else + *MemoryAddress = reinterpret_cast(gpu_mem->GpuAddress()); + + (*allocation_map_)[*MemoryAddress] = Allocation( + gpu_mem->GetGpuMemoryHandle(), *MemoryAddress, (uint64_t)*MemoryAddress, + create_info.size, false, nullptr, SizeInBytes, + MemFlags.ui32.GTTAccess ? 0 : PreferredNode, MemFlags.Value); + return HSAKMT_STATUS_SUCCESS; + } else if (trim_safe) { + /* attempt to release memory from the block allocator and retry */ + fragment_allocator_.trim(); + trim_safe = false; + goto after_trim; + } + + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode, + HSAuint64 SizeInBytes, + HSAuint64 Alignment, + HsaMemFlags MemFlags, + void **MemoryAddress) { + return hsaKmtAllocMemoryAlignInternal(PreferredNode, SizeInBytes, + Alignment, MemFlags, + MemoryAddress, + !dxg_runtime->enable_thunk_sub_allocator); +} + +HSAKMT_STATUS hsaKmtFreeMemoryInternal(void *MemoryAddress, + HSAuint64 SizeInBytes, + bool SkipSubAlloc) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (!SkipSubAlloc) { + if (fragment_allocator_.free(MemoryAddress)) + return HSAKMT_STATUS_SUCCESS; + } + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + if (gpu_mem->IsQueueReferenced()) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = 
gpu_mem->Flags(); + if (flags.is_imported_vram_ipc && + gpu_mem->DecSharedReference()) { + pr_info("memory is still referenced\n"); + return HSAKMT_STATUS_SUCCESS; + } + + if (it->second.dmabuf_fd >= 0) { + close(it->second.dmabuf_fd); + it->second.dmabuf_fd = -1; + } + allocation_map_->erase(it); + } + + delete gpu_mem; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress, + HSAuint64 SizeInBytes) { + return hsaKmtFreeMemoryInternal(MemoryAddress, SizeInBytes); +} + +bool queue_acquire_buffer(void *MemoryAddress) { + if (!MemoryAddress) + return false; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + gpu_mem->GetQueueReference(); + } + if (gpu_mem == nullptr) + return false; + + return true; +} + +bool queue_release_buffer(void *MemoryAddress) { + if (!MemoryAddress) + return false; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + gpu_mem->PutQueueReference(); + } + if (gpu_mem == nullptr) + return false; + + return true; +} + +wsl::thunk::GpuMemory *get_gpu_mem(void *MemoryAddress) { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return nullptr; + } + + return wsl::thunk::GpuMemory::Convert(it->second.handle); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, + HSAuint64 *AvailableBytes) { + CHECK_DXG_OPEN(); + + if (!AvailableBytes) + return HSAKMT_STATUS_INVALID_PARAMETER; + + wsl::thunk::WDDMDevice *dev = get_wddmdev(Node); + if (!dev) + return 
HSAKMT_STATUS_ERROR; + + *AvailableBytes = dev->VramAvail(); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, + HSAuint64 MemorySizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) { + CHECK_DXG_OPEN(); + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags( + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HsaMemFlags MemFlags) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_debug("address %p\n", MemoryAddress); + + if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain) + return HSAKMT_STATUS_INVALID_PARAMETER; + + // Registered memory should be ordinary paged host memory. + if ((MemFlags.ui32.HostAccess != 1) || (MemFlags.ui32.NonPaged == 1)) + return HSAKMT_STATUS_NOT_SUPPORTED; + + if (!dxg_runtime->hsakmt_is_dgpu) + /* TODO: support mixed APU and dGPU configurations */ + return HSAKMT_STATUS_NOT_SUPPORTED; + + return HSAKMT_STATUS_SUCCESS; +} + +bool is_ipc_sysmemfd(int fd) { + std::string fdPath = "/proc/self/fd/" + std::to_string(fd); + char linkTarget[256]; + ssize_t bytes = readlink(fdPath.c_str(), linkTarget, sizeof(linkTarget) - 1); + if (bytes == -1) + return false; + linkTarget[bytes] = '\0'; + return strstr(linkTarget, "rocr4wsl_gtt") != nullptr; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray) { + HSA_REGISTER_MEM_FLAGS regFlags; + regFlags.Value = 0; + + return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle, + GraphicsResourceInfo, + NumberOfNodes, + NodeArray, + regFlags); +} + +HSAKMT_STATUS 
HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle, + HsaGraphicsResourceInfo *GraphicsResourceInfo, + HSAuint64 NumberOfNodes, + HSAuint32 *NodeArray, + HSA_REGISTER_MEM_FLAGS RegisterFlags) { + CHECK_DXG_OPEN(); + uint32_t *gpu_id_array = NULL; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + if (is_ipc_sysmemfd(GraphicsResourceHandle)) { + GraphicsResourceInfo->NodeId = dxg_runtime->default_node; + pr_info("skip register sysmemfd. It would be released in next step\n"); + return HSAKMT_STATUS_SUCCESS; + } + + if (NumberOfNodes == 0) { + RegisterFlags.ui32.requiresVAddr = 0; + NumberOfNodes = 1; + NodeArray = (HSAuint32*)&(dxg_runtime->default_node); + } + + pr_debug("number of nodes %lu\n", NumberOfNodes); + wsl::thunk::GpuMemoryHandle mem_handle; + ret = import_dmabuf_fd(GraphicsResourceHandle, NodeArray[0], + RegisterFlags.ui32.requiresVAddr, + false, &mem_handle); + if (ret != HSAKMT_STATUS_SUCCESS) { + pr_err("hsaKmtRegisterGraphicsHandleToNodesExt: import_dmabuf_fd failed, " + "GraphicsResourceHandle: %lu, NodeId: %u\n", + GraphicsResourceHandle, NodeArray[0]); + return ret; + } + wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(mem_handle); + GraphicsResourceInfo->NodeId = gpu_mem->GetDevice()->NodeId(); + GraphicsResourceInfo->SizeInBytes = gpu_mem->ClientSize(); + GraphicsResourceInfo->MemoryAddress = RegisterFlags.ui32.requiresVAddr ? 
+ reinterpret_cast(gpu_mem->GpuAddress()): + reinterpret_cast(gpu_mem->HandleApeAddress()); + + return ret; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + int *DMABufFd, + HSAuint64 *Offset) { + CHECK_DXG_OPEN(); + + std::lock_guard gard(*allocation_map_lock_); + + auto it = allocation_map_->upper_bound(MemoryAddress); + if (it != allocation_map_->begin()) { + --it; + if (it->second.dmabuf_fd == -1) { + auto gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + auto code = gpu_mem->ExportPhysicalHandle(DMABufFd); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + it->second.dmabuf_fd = *DMABufFd; + } + *DMABufFd = dup(it->second.dmabuf_fd); + *Offset = reinterpret_cast(MemoryAddress) - it->second.gpu_addr; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetMemoryHandle(void *MemoryAddress, HSAuint64 SizeInBytes, + uint64_t *SharedMemoryHandle) { + CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS import_dmabuf_fd(int DMABufFd, + uint32_t NodeId, + bool alloc_va, + bool is_ipc_memfd, + wsl::thunk::GpuMemoryHandle *GpuMemHandle) { + CHECK_DXG_OPEN(); + + *GpuMemHandle = nullptr; + wsl::thunk::WDDMDevice* dev = get_wddmdev(NodeId); + wsl::thunk::GpuMemory *gpu_mem = nullptr; + wsl::thunk::GpuMemoryCreateInfo create_info{}; + create_info.dmabuf_fd = DMABufFd; + create_info.flags.alloc_va = alloc_va; + + if (is_ipc_memfd) { + struct stat st; + fstat(DMABufFd, &st); + uint64_t sz = st.st_size; + if (4096 <= sz && sz < dxg_runtime->SystemHeapSize() && (sz & 0xfff) == 0) { + pr_debug("DMABufFd %d is sys mem fd(IPC signal), get size:%ld from it\n", DMABufFd, st.st_size); + create_info.flags.sysmem_ipc_sig_importer = 1; // set to 1 when backend is system memory + create_info.size = st.st_size; + } + } + + gpusize gpu_va = 0; + auto code = dev->CreateGpuMemory(create_info, &gpu_mem, &gpu_va); + if (code == 
ErrorCode::SameProcessSameDevice) { + /* Unit_hipMemPoolExportToShareableHandle_SameProc */ + pr_info("imported from same process, use the old one\n"); + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find((void*)gpu_va); + if (it == allocation_map_->end()) { + pr_err("where's the conflict buffer? va %#lx\n", create_info.va_hint); + return HSAKMT_STATUS_ERROR; + } + wsl::thunk::GpuMemory *conflict_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + conflict_mem->IncSharedReference(); + *GpuMemHandle = it->second.handle; + return HSAKMT_STATUS_SUCCESS; + } else if (code != ErrorCode::Success) { + pr_err("fail to import fd, ret %d\n", (int)code); + return HSAKMT_STATUS_ERROR; + } + + void *MemoryAddress; + if (alloc_va) + MemoryAddress = reinterpret_cast(gpu_mem->GpuAddress()); + else + MemoryAddress = reinterpret_cast(gpu_mem->HandleApeAddress()); + + *GpuMemHandle = gpu_mem->GetGpuMemoryHandle(); + + std::lock_guard gard(*allocation_map_lock_); + /* + * the gpu_mem->Flags() need convert back from GpuMemoryCreateFlags to + * HsaMemFlags, reference hsaKmtAllocMemoryAlign + * */ + (*allocation_map_)[MemoryAddress] = Allocation( + *GpuMemHandle, MemoryAddress, (uint64_t)MemoryAddress, + gpu_mem->Size(), false, nullptr, gpu_mem->ClientSize(), + NodeId, gpu_mem->Flags()); + + return HSAKMT_STATUS_SUCCESS; + +} + + +HSAKMT_STATUS HSAKMTAPI +hsaKmtShareMemory(void *MemoryAddress, HSAuint64 SizeInBytes, + HsaSharedMemoryHandle *SharedMemoryHandle) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle, + void **MemoryAddress, HSAuint64 *SizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes( + const HsaSharedMemoryHandle *SharedMemoryHandle, void 
**MemoryAddress, + HSAuint64 *SizeInBytes, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid, + HsaMemoryRange *LocalMemoryArray, + HSAuint64 LocalMemoryArrayCount, + HsaMemoryRange *RemoteMemoryArray, + HSAuint64 RemoteMemoryArrayCount, + HSAuint64 *SizeCopied) { + CHECK_DXG_OPEN(); + pr_warn_once("has been deprecated\n"); + assert(false); + return HSAKMT_STATUS_NOT_IMPLEMENTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid, + HsaMemoryRange *LocalMemoryArray, + HSAuint64 LocalMemoryArrayCount, + HsaMemoryRange *RemoteMemoryArray, + HSAuint64 RemoteMemoryArrayCount, + HSAuint64 *SizeCopied) { + CHECK_DXG_OPEN(); + pr_warn_once("has been deprecated\n"); + assert(false); + return HSAKMT_STATUS_NOT_IMPLEMENTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_debug("address %p\n", MemoryAddress); + + { + std::lock_guard gard(*allocation_map_lock_); + + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_SUCCESS; + } + + auto *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = gpu_mem->Flags(); + // IPC mem(vram) + if (flags.is_imported_vram_ipc && + gpu_mem->DecSharedReference() == 0) { + allocation_map_->erase(it); + delete gpu_mem; + return HSAKMT_STATUS_SUCCESS; + } + if (it->second.userptr) { + allocation_map_->erase(it); + allocation_map_->erase((void *)it->second.gpu_addr); + delete gpu_mem; + return HSAKMT_STATUS_SUCCESS; + } + } + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress, + HSAuint64 MemorySizeInBytes, + HSAuint64 *AlternateVAGPU) { + + HSAuint64 NumberOfNodes = 1; + 
HSAuint32 NodeArray[] = {dxg_runtime->default_node}; + HsaMemMapFlags MemMapFlags; + MemMapFlags.Value = 0; + + return hsaKmtMapMemoryToGPUNodes(MemoryAddress, MemorySizeInBytes, AlternateVAGPU, + MemMapFlags, NumberOfNodes, NodeArray); +} +HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes( + void *MemoryAddress, HSAuint64 MemorySizeInBytes, HSAuint64 *AlternateVAGPU, + HsaMemMapFlags MemMapFlags, HSAuint64 NumberOfNodes, HSAuint32 *NodeArray) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress || !AlternateVAGPU) { + pr_err("FIXME: mapping NULL pointer\n"); + return HSAKMT_STATUS_ERROR; + } + + uint64_t start = wsl::AlignDown((uint64_t)MemoryAddress, 4096); + uint64_t end = + wsl::AlignUp((uint64_t)MemoryAddress + MemorySizeInBytes, 4096); + + void *aligned_ptr = (void *)start; + size_t aligned_size = end - start; + + { + if (nullptr != fragment_allocator_.block_base(aligned_ptr)) + return HSAKMT_STATUS_SUCCESS; + } + + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find(aligned_ptr); + if (it != allocation_map_->end()) { + wsl::thunk::GpuMemory *gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = gpu_mem->Flags(); + // IPC mem + if (flags.is_imported_vram_ipc) { + + auto code = gpu_mem->MapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size()); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + + code = gpu_mem->MakeResident(); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::WDDMDevice *dev = gpu_mem->GetDevice(); + if (!dev->WaitOnPagingFenceFromCpu()) + return HSAKMT_STATUS_ERROR; + + return HSAKMT_STATUS_SUCCESS; + } + + if (!it->second.userptr) { + // GTT/Local mem + if (it->second.size >= MemorySizeInBytes) { + *AlternateVAGPU = (uint64_t)MemoryAddress; + return HSAKMT_STATUS_SUCCESS; + } else { + return HSAKMT_STATUS_ERROR; + } + } + } + + // userptr mem + it = allocation_map_->find(MemoryAddress); + if (it != 
allocation_map_->end()) { + if (it->second.userptr && it->second.size >= MemorySizeInBytes) { + *AlternateVAGPU = + (uintptr_t)it->second.gpu_addr + + ((uintptr_t)MemoryAddress - (uintptr_t)it->second.cpu_addr); + return HSAKMT_STATUS_SUCCESS; + } + } + } + + // map userptr + wsl::thunk::WDDMDevice *dev = get_wddmdev(NodeArray[0]); + if (!dev) + return HSAKMT_STATUS_ERROR; + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + wsl::thunk::GpuMemoryHandle handle = 0; + uint64_t addr; + wsl::thunk::GpuMemoryCreateInfo create_info{}; + create_info.domain = thunk_proxy::kUserMemory; + create_info.size = aligned_size; + create_info.user_ptr = aligned_ptr; + + auto code = dev->CreateGpuMemory(create_info, &gpu_mem); + if (code == ErrorCode::Success) { + addr = gpu_mem->GpuAddress(); + handle = gpu_mem->GetGpuMemoryHandle(); + } else { + return HSAKMT_STATUS_ERROR; + } + + { + std::lock_guard guard(*allocation_map_lock_); + (*allocation_map_)[MemoryAddress] = + Allocation(handle, aligned_ptr, addr, aligned_size, true, MemoryAddress, + MemorySizeInBytes); + (*allocation_map_)[(void *)addr] = + Allocation(handle, aligned_ptr, addr, aligned_size, true, nullptr, + MemorySizeInBytes); + } + + *AlternateVAGPU = addr + ((uintptr_t)MemoryAddress - (uintptr_t)aligned_ptr); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress) { + CHECK_DXG_OPEN(); + + if (!MemoryAddress) { + /* Workaround for runtime bug */ + pr_err("FIXME: Unmapping NULL pointer\n"); + return HSAKMT_STATUS_SUCCESS; + } + + pr_debug("address %p\n", MemoryAddress); + + { + if (nullptr != fragment_allocator_.block_base(MemoryAddress)) + return HSAKMT_STATUS_SUCCESS; + } + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + { + std::lock_guard gard(*allocation_map_lock_); + + auto it = allocation_map_->find(MemoryAddress); + if (it == allocation_map_->end()) { + return HSAKMT_STATUS_ERROR; + } + + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + if 
(gpu_mem->IsQueueReferenced()) + return HSAKMT_STATUS_ERROR; + + // IPC mem + wsl::thunk::GpuMemoryDescFlags flags; + flags.reserved = gpu_mem->Flags(); + if (flags.is_imported_vram_ipc && + !gpu_mem->IsSharedFromSameProcess()) { + auto code = gpu_mem->UnmapGpuVirtualAddress(gpu_mem->GpuAddress(), gpu_mem->Size()); + if (code != ErrorCode::Success) + return HSAKMT_STATUS_ERROR; + gpu_mem->Evict(); + + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId, + HSAuint64 GraphicDeviceHandle, + HSAuint64 GraphicResourceHandle, + HSAuint64 GraphicResourceOffset, + HSAuint64 GraphicResourceSize, + HSAuint64 *FlatMemoryAddress) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + /* This API was only ever implemented in KFD for Kaveri and + * was never upstreamed. There are no open-source users of + * this interface. It has been superseded by + * RegisterGraphicsHandleToNodes. + */ + return HSAKMT_STATUS_NOT_IMPLEMENTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId, + HSAuint64 FlatMemoryAddress, + HSAuint64 SizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, + HsaGpuTileConfig *config) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer, + HsaPointerInfo *PointerInfo) { + CHECK_DXG_OPEN(); + + if (!Pointer || !PointerInfo) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_debug("pointer %p\n", Pointer); + + memset(PointerInfo, 0, sizeof(HsaPointerInfo)); + + wsl::thunk::GpuMemory *gpu_mem = nullptr; + Allocation allocation_info; + bool found = false; + { + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->upper_bound(Pointer); + if (it != allocation_map_->begin()) { + 
--it; + if (Pointer >= it->first && + (Pointer < reinterpret_cast(it->first) + it->second.size_requested)) { + allocation_info = it->second; + gpu_mem = wsl::thunk::GpuMemory::Convert(it->second.handle); + found = true; + } + } + } + + if (!found) { + pr_debug("can't found allocation for %p\n", Pointer); + PointerInfo->Type = HSA_POINTER_UNKNOWN; + return HSAKMT_STATUS_ERROR; + } + + if (allocation_info.userptr) { + PointerInfo->Type = HSA_POINTER_REGISTERED_USER; + PointerInfo->SizeInBytes = allocation_info.size; + } else if (gpu_mem->IsVirtual()) { + PointerInfo->Type = HSA_POINTER_RESERVED_ADDR; + } else { + PointerInfo->Type = HSA_POINTER_ALLOCATED; + PointerInfo->SizeInBytes = allocation_info.size_requested; + } + + PointerInfo->Node = allocation_info.node_id; + PointerInfo->MemFlags.Value = allocation_info.mem_flags_value; + PointerInfo->CPUAddress = allocation_info.cpu_addr; + PointerInfo->GPUAddress = allocation_info.gpu_addr; + PointerInfo->UserData = allocation_info.rocr_userdata; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer, + void *UserData) { + CHECK_DXG_OPEN(); + + uint64_t aligned_ptr = wsl::AlignDown((uint64_t)Pointer, 4096); + + std::lock_guard gard(*allocation_map_lock_); + auto it = allocation_map_->find((void *)aligned_ptr); + if (it != allocation_map_->end()) { + it->second.rocr_userdata = UserData; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_ERROR; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + assert(false); +#ifdef SANITIZER_AMDGPU + pr_debug("address %p\n", addr); + CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +#else + return HSAKMT_STATUS_NOT_SUPPORTED; +#endif +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + assert(false); +#ifdef SANITIZER_AMDGPU + pr_debug("address %p\n", addr); + 
CHECK_DXG_OPEN(); + + return HSAKMT_STATUS_SUCCESS; +#else + return HSAKMT_STATUS_NOT_SUPPORTED; +#endif +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp new file mode 100644 index 0000000000..eb22a13aae --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/openclose.cpp @@ -0,0 +1,626 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +hsakmtRuntime *dxg_runtime = new hsakmtRuntime(); + +void hsakmtRuntime::HeapInit() { + ReserveLocalHeapSpace(); + ReserveSystemHeapSpace(); + InitHandleApertureSpace(); + InitLocalHeapMgr(); + InitSystemHeapMgr(); + InitHandleApertureMgr(); +} + +void hsakmtRuntime::HeapFini() { + FreeSystemHeapSpace(); + FreeLocalHeapSpace(); +} + +bool hsakmtRuntime::ReserveSvmSpace(uint64_t &base, uint64_t &size, uint64_t align) { + uint64_t sys_va[16] = {0}; + uint64_t local_va; + uint64_t sys_va_size; + int match_index = -1; + void* ptr = NULL; + + wsl::thunk::WDDMDevice* device; + size_t num_adapters = get_num_wddmdev(); + + base = 0; + sys_va_size = size + align; + + /* it will retry 16 times to find the avaliable range. */ + for (int i = 0; i < 16; i++) { + local_va = 0; + ptr = mmap(NULL, sys_va_size , PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) { + pr_err("fail to reserve cpu va in %d time!\n", i); + break; + } + + sys_va[i] = (uint64_t)ptr; + + int match_cnt = 0; + for (uint32_t j = 0; j < num_adapters; j++) { + device = get_wddmdev(j+1); + uint64_t start = (base == 0) ? (uint64_t)ptr : base; + uint64_t end = start + ((base == 0) ? 
sys_va_size : size) + 1; + + if (wsl::thunk::d3dthunk::ReserveGpuVirtualAddress( + device->GetAdapter(), size, + start, + end, &local_va) == ErrorCode::Success) { + + match_cnt++; + base = local_va; + pr_debug("success to reserve gpu va %lx and va cpu %p in %d time\n", + local_va, ptr, i); + } else { + pr_err("%s fail to reserve gpu va for cpu va %p in %d time!\n", + __FUNCTION__, ptr, i); + } + } + + if (match_cnt == num_adapters) { + match_index = i; + break; + } + } + + if (match_index >= 0) { + /* release cpu unused ranges*/ + uint64_t left_size = local_va - sys_va[match_index]; + uint64_t right_size = align - left_size; + if ((left_size > 0) && munmap((void*)sys_va[match_index], left_size)) + pr_err("fail to unmap left %lx with size %lx\n", sys_va[match_index], left_size); + if ((right_size > 0) && munmap((void*)(local_va + size), right_size)) + pr_err("fail to unmap right %lx with size %lx\n", (local_va + size), right_size); + } else { + pr_err("fail to reserve Local Heap Space!\n"); + base = 0; + size = 0; + } + + /* free match fail address for cpu va */ + int free = match_index >= 0 ? match_index : 16; + for (int j = 0; j < free; j++) { + if (sys_va[j] != 0 && munmap((void*)sys_va[j], sys_va_size)) { + pr_err("fail to unmap %d %lx\n", j, sys_va[j]); + } + } + + return match_index >= 0; +} + +/* + * To find the avaliable same range for cpu + * virtual space and gpu virtual space. + * sys_va_size of cpu va range is larger 1G + * than gpu va range, otherwise ReserveGPUVirtualAddress + * will return error. 
+ */ +bool hsakmtRuntime::ReserveLocalHeapSpace() { + wsl::thunk::WDDMDevice* device; + uint64_t total_local_size = 0; + uint64_t align = 0x40000000; /* 1G */ + size_t num_adapters = get_num_wddmdev(); + + for (uint32_t j = 0; j < num_adapters; j++) { + device = get_wddmdev(j+1); + if (device == nullptr) + return -1; + /* + * For APU, use non local memory(shared GPU memory) as GPU memory, + * because it has small local memory + */ + if (device->IsDgpu()) + total_local_size = wsl::Max(device->LocalHeapSize(), total_local_size); + else + total_local_size = wsl::Max(device->LocalHeapSize(), device->NonLocalHeapSize(), total_local_size); + } + + total_local_size = wsl::AlignUp(total_local_size, align) * 4; + local_heap_space_start_ = 0; + local_heap_space_size_ = total_local_size; + + return ReserveSvmSpace(local_heap_space_start_, local_heap_space_size_, align); +} + +bool hsakmtRuntime::FreeSvmSpace(uint64_t &base, uint64_t &size) { + wsl::thunk::WDDMDevice* device; + size_t num_adapters = get_num_wddmdev(); + for (uint32_t j = 0; j < num_adapters; j++) { + device = get_wddmdev(j+1); + if (device == nullptr) + return -1; + wsl::thunk::d3dthunk::FreeGpuVirtualAddress(device->GetAdapter(), base, size); + } + + void *cpu = (void *)base; + auto r = (munmap(cpu, size) == 0); + base = 0; + size = 0; + return r; +} + +bool hsakmtRuntime::FreeLocalHeapSpace() { + return FreeSvmSpace(local_heap_space_start_, local_heap_space_size_); +} + +void hsakmtRuntime::InitLocalHeapMgr() { + local_heap_mgr_ = std::make_unique(local_heap_space_start_, + local_heap_space_size_, + DEFAULT_GPU_PAGE_SIZE); +} + +bool hsakmtRuntime::ReserveSystemHeapSpace() { + struct sysinfo info; + int ret = sysinfo(&info); + uint64_t max_ram = 0x10000000000; + uint64_t alignment = 0x100000000; + assert(!ret); + + int32_t protFlags = PROT_NONE; + // minimum of reserve size is 8G, maximum of reserve size is 1T. 
+ system_heap_space_size_ = std::min(wsl::AlignUp(info.totalram, alignment) * 2, max_ram); + + return ReserveSvmSpace(system_heap_space_start_, system_heap_space_size_, alignment); +} + +bool hsakmtRuntime::FreeSystemHeapSpace(void) { + return FreeSvmSpace(system_heap_space_start_, system_heap_space_size_); +} + +bool hsakmtRuntime::CommitSystemHeapSpace(void* addr, int64_t size, bool lock) { + int32_t protFlags = PROT_READ | PROT_WRITE | PROT_EXEC; + int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED| + MAP_NORESERVE|MAP_UNINITIALIZED; + if (lock) + mapFlags |= MAP_LOCKED; + void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? "locked" : ""), addr, paddr); + return false; + } + assert(addr == paddr); + + /*if (!Runtime::runtime_singleton_->PinWARequired()) + return true;*/ + + /* + * Do not make the pages in this range available to the child + * after a fork(2). This is useful to prevent copy-on-write + * semantics from changing the physical location of a page if + * the parent writes to it after a fork(2). (Such page + * relocations cause problems for hardware that DMAs into the + * page.) 
+ * + * https://man7.org/linux/man-pages/man2/madvise.2.html + */ + if (madvise(addr, size, MADV_DONTFORK)) + pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr); + + return true; +} + +bool hsakmtRuntime::DecommitSystemHeapSpace(void* addr, int64_t size) { + int32_t protFlags = PROT_NONE; + int32_t mapFlags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED| + MAP_NORESERVE|MAP_UNINITIALIZED; + void* paddr = mmap(addr, size, protFlags, mapFlags, -1, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to decommit addr = %p, paddr = %p\n", addr, paddr); + return false; + } + assert(addr == paddr); + return true; +} + +void hsakmtRuntime::InitSystemHeapMgr() { + system_heap_mgr_ = std::make_unique(system_heap_space_start_, + system_heap_space_size_, + DEFAULT_GPU_PAGE_SIZE); +} + +ErrorCode hsakmtRuntime::ReserveGpuVirtualAddress(const thunk_proxy::AllocDomain domain, + gpusize hit_base_addr, gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, bool lock) { + gpusize gpu_addr = 0; + ErrorCode code = ErrorCode::Success; + + uint64_t align = alignment == 0 ? (64 * 1024) : alignment; // default 64K alignment + if (size >= GPU_HUGE_PAGE_SIZE) + align = GPU_HUGE_PAGE_SIZE; + + if (domain == thunk_proxy::kSystem) { + gpu_addr = system_heap_mgr_->Alloc(size, align, hit_base_addr); + if (gpu_addr == 0) + code = ErrorCode::OutOfMemory; + + if (!CommitSystemHeapSpace((void*)gpu_addr, size, lock)) { + system_heap_mgr_->Free(gpu_addr); + code = ErrorCode::SyscallFail; + } + } else { + gpu_addr = local_heap_mgr_->Alloc(size, align, hit_base_addr); + if (gpu_addr == 0) + code = ErrorCode::OutOfGpuMemory; + } + + *out_gpu_virt_addr = (code == ErrorCode::Success) ? 
gpu_addr : 0; + return code; +} + +ErrorCode hsakmtRuntime::FreeGpuVirtualAddress(const thunk_proxy::AllocDomain domain, + gpusize gpu_addr, gpusize size) { + auto code = ErrorCode::Success; + + if (domain == thunk_proxy::kSystem) { + DecommitSystemHeapSpace((void *)gpu_addr, size); + system_heap_mgr_->Free(gpu_addr); + } else { + local_heap_mgr_->Free(gpu_addr); + } + + return code; +} + +bool hsakmtRuntime::CommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd, bool lock) { + int fd = -1; + + if (memfd == -1) { + fd = memfd_create("rocr4wsl_gtt", MFD_CLOEXEC); + if (fd < 0) { + pr_err("memfd_create failed\n"); + return false; + } + + ftruncate(fd, size); + } else { + fd = memfd; + } + + int32_t protFlags = PROT_READ | PROT_WRITE; + int32_t mapFlags = MAP_SHARED | MAP_FIXED | MAP_NORESERVE | + MAP_UNINITIALIZED | (lock ? MAP_LOCKED : 0); + + void* paddr = mmap(addr, size, protFlags, mapFlags, fd, 0); + if (paddr == MAP_FAILED) { + pr_err("fail to commit %s addr = %p, paddr = %p\n", (lock ? 
"locked" : ""), addr, paddr); + if (memfd == -1) + close(fd); + return false; + } + assert(addr == paddr); + + memfd = fd; + + if (madvise(addr, size, MADV_DONTFORK)) + pr_err("fail to set MADV_DONTFORK for addr = %p\n", addr); + + return true; +} + +bool hsakmtRuntime::DecommitSystemHeapSpaceIPC(void* addr, int64_t size, int &memfd) { + if (munmap(addr, size) != 0) { + pr_err("fail to unmap = %p \n", addr); + return false; + } + close(memfd); + memfd = -1; + return true; +} + +ErrorCode hsakmtRuntime::ReserveIPCSysMem(gpusize size, + gpusize *out_gpu_virt_addr, gpusize alignment, + int &memfd, bool lock) { + gpusize gpu_addr = 0; + ErrorCode code = ErrorCode::Success; + gpu_addr = system_heap_mgr_->Alloc(size, alignment, 0); + if (gpu_addr == 0) + return ErrorCode::OutOfMemory; + + if (!CommitSystemHeapSpaceIPC((void*)gpu_addr, size, memfd, lock)) { + system_heap_mgr_->Free(gpu_addr); + code = ErrorCode::SyscallFail; + } + + *out_gpu_virt_addr = (code == ErrorCode::Success) ? gpu_addr : 0; + return code; +} + +ErrorCode hsakmtRuntime::FreeIPCSysMem(gpusize gpu_addr, gpusize size, int &memfd) { + auto code = ErrorCode::Success; + + DecommitSystemHeapSpaceIPC((void *)gpu_addr, size, memfd); + + system_heap_mgr_->Free(gpu_addr); + return code; +} + +bool hsakmtRuntime::InitHandleApertureSpace() { + wsl::thunk::WDDMDevice* device; + size_t num_adapters = get_num_wddmdev(); + handle_aperture_start_ = START_NON_CANONICAL_ADDR; + handle_aperture_size_ = 1ULL << 47; + + while (handle_aperture_start_ < END_NON_CANONICAL_ADDR - 1) { + for (uint32_t j = 0; j < num_adapters;) { + device = get_wddmdev(j+1); + if (device == nullptr) + return -1; + + if (device->PrivateApertureBase() && + IS_OVERLAPPING(device->PrivateApertureBase(), + device->PrivateApertureSize(), + handle_aperture_start_, + handle_aperture_size_)) { + handle_aperture_start_ += (1ULL << 47); + continue; + } + + if (device->SharedApertureBase() && + IS_OVERLAPPING(device->SharedApertureBase(), + 
device->SharedApertureSize(), + handle_aperture_start_, + handle_aperture_size_)) { + handle_aperture_start_ += (1ULL << 47); + continue; + } + + j++; + } + pr_debug("handle aperture start %lx, size %lx\n", handle_aperture_start_, handle_aperture_size_); + return true; + } + + handle_aperture_start_ = 0; + pr_err("fail\n"); + + return false; +} + +void hsakmtRuntime::InitHandleApertureMgr() { + handle_aperture_mgr_ = std::make_unique(handle_aperture_start_, + handle_aperture_size_, + DEFAULT_GPU_PAGE_SIZE); +} + +ErrorCode hsakmtRuntime::HandleApertureAlloc(gpusize size, gpusize *out_gpu_virt_addr) { + uint64_t align = DEFAULT_GPU_PAGE_SIZE; + + if (size >= GPU_HUGE_PAGE_SIZE) + align = GPU_HUGE_PAGE_SIZE; + + *out_gpu_virt_addr = handle_aperture_mgr_->Alloc(size, align); + if (*out_gpu_virt_addr == 0) + return ErrorCode::OutOfHandleApeMemory; + + return ErrorCode::Success; +} + +void hsakmtRuntime::HandleApertureFree(gpusize gpu_addr) { + handle_aperture_mgr_->Free(gpu_addr); +} + +/* is_forked_child detects when the process has forked since the last + * time this function was called. We cannot rely on pthread_atfork + * because the process can fork without calling the fork function in + * libc (using clone or calling the system call directly). + */ +bool is_forked_child(void) { + if (dxg_runtime->is_forked) + return true; + + pid_t cur_pid = getpid(); + if (dxg_runtime->parent_pid != cur_pid) { + dxg_runtime->is_forked = true; + dxg_runtime->parent_pid = cur_pid; + return true; + } + + return false; +} + +/* Callbacks from pthread_atfork */ +static void prepare_fork_handler(void) { pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); } +static void parent_fork_handler(void) { pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); } +static void child_fork_handler(void) { + pthread_mutex_init(&dxg_runtime->hsakmt_mutex, NULL); + dxg_runtime->is_forked = true; +} + +/* Call this from the child process after fork. 
This will clear all + * data that is duplicated from the parent process, that is not valid + * in the child. + * The topology information is duplicated from the parent is valid + * in the child process so it is not cleared + */ +static void clear_after_fork(void) { + reset_suballocator(); + clear_allocation_map(); + + if (dxg_runtime->dxg_fd >= 0) { + close(dxg_runtime->dxg_fd); + dxg_runtime->dxg_fd = -1; + } + delete dxg_runtime; + dxg_runtime = new hsakmtRuntime(); + +} + +static inline void init_page_size(void) { + dxg_runtime->page_size = sysconf(_SC_PAGESIZE); + dxg_runtime->page_shift = ffs(dxg_runtime->page_size) - 1; +} + +static HSAKMT_STATUS init_vars_from_env(void) { + char *envvar; + int debug_level; + + /* Normally libraries don't print messages. For debugging purpose, we'll + * print messages if an environment variable, HSAKMT_DEBUG_LEVEL, is set. + */ + envvar = getenv("HSAKMT_DEBUG_LEVEL"); + if (envvar) { + dxg_runtime->hsakmt_debug_level = atoi(envvar); + } + + /* Check whether to support Zero frame buffer */ + envvar = getenv("HSA_ZFB"); + if (envvar) + dxg_runtime->zfb_support = atoi(envvar); + + /* Check whether to handle vendor specific aql packet */ + envvar = getenv("WSLKMT_VENDOR_PACKET"); + if (envvar) + dxg_runtime->vendor_packet_process = atoi(envvar); + + /* Decide whether to check available system memory before allocation */ + envvar = getenv("WSL_CHECK_AVAIL_SYSRAM"); + if (envvar) + dxg_runtime->check_avail_sysram = !strcmp(envvar, "1"); + + envvar = getenv("WSL_ENABLE_THUNK_SUB_ALLOCATOR"); + if (envvar) + dxg_runtime->enable_thunk_sub_allocator = atoi(envvar); + + envvar = getenv("ROCR_VISIBLE_DEVICES"); + if (envvar) { + std::string devices(envvar); + size_t first_num_pos = devices.find_first_of("0123456789"); + if (first_num_pos != std::string::npos) + dxg_runtime->default_node = std::stoi(devices.substr(first_num_pos)) + 1; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void) { + HSAKMT_STATUS 
result; + int fd = -1; + HsaSystemProperties sys_props; + char *error; + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* If the process has forked, the child process must re-initialize + * it's connection to DXG. Any references tracked by dxg_open_count + * belong to the parent + */ + if (is_forked_child()) + clear_after_fork(); + + if (dxg_runtime->dxg_open_count == 0) { + static bool atfork_installed = false; + + result = init_vars_from_env(); + if (result != HSAKMT_STATUS_SUCCESS) + goto open_failed; + + if (dxg_runtime->dxg_fd < 0) { + fd = open(dxg_runtime->dxg_device_name, O_RDWR | O_CLOEXEC); + + if (fd == -1) { + result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; + goto open_failed; + } + + dxg_runtime->dxg_fd = fd; + } + if (!wsl::thunk::dxcore::DxcoreLoader::Instance().Initialize()) { + pr_err("Failed to load libdxcore.so\n"); + result = HSAKMT_STATUS_ERROR; + goto dxcore_loader_failed; + } + + hsakmt_hsa_loader_init(); + init_page_size(); + + char *useSvmStr = getenv("HSA_USE_SVM"); + dxg_runtime->is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0")) && false; + + dxg_runtime->dxg_open_count = 1; + + if (!atfork_installed) { + /* Atfork handlers cannot be uninstalled and + * must be installed only once. Otherwise + * prepare will deadlock when trying to take + * the same lock multiple times. 
+ */ + pthread_atfork(prepare_fork_handler, parent_fork_handler, + child_fork_handler); + atfork_installed = true; + } + } else { + dxg_runtime->dxg_open_count++; + result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED; + } + + reset_suballocator(); + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return result; +dxcore_loader_failed: + close(fd); +open_failed: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + + return result; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void) { + HSAKMT_STATUS result; + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + if (dxg_runtime->dxg_open_count > 0) { + if (--dxg_runtime->dxg_open_count == 0) { + close(dxg_runtime->dxg_fd); + dxg_runtime->dxg_fd = -1; + wsl::thunk::dxcore::DxcoreLoader::Instance().Shutdown(); + } + + result = HSAKMT_STATUS_SUCCESS; + } else + result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; + + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + + return result; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp new file mode 100644 index 0000000000..6c6a9e2a04 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/pc_sampling.cpp @@ -0,0 +1,78 @@ +/* + * Copyright © 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info, + HSAuint32 sample_info_sz, HSAuint32 *size) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, + HsaPcSamplingInfo *sample_info, + HsaPcSamplingTraceId *traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, + HsaPcSamplingTraceId traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, + HsaPcSamplingTraceId traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, + HsaPcSamplingTraceId traceId) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git 
a/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp new file mode 100644 index 0000000000..9189d2dafa --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/perfctr.cpp @@ -0,0 +1,90 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties( + HSAuint32 NodeId, HsaCounterProperties **CounterProperties) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +/* Registers a set of (HW) counters to be used for tracing/profiling */ +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId, + HSAuint32 NumberOfCounters, + HsaCounter *Counters, + HsaPmcTraceRoot *TraceRoot) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +/* Unregisters a set of (HW) counters used for tracing/profiling */ + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId, + HSATraceId TraceId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId, + HSATraceId TraceId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId, + HSATraceId TraceId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +/* Starts tracing operation on a previously established set of performance + * counters */ +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId, + void *TraceBuffer, + HSAuint64 TraceBufferSizeBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +/*Forces an update of all the counters that a previously started trace operation + * has registered */ +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +/* Stops tracing operation on a previously established set of performance + * counters */ +HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return 
HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp new file mode 100644 index 0000000000..edaaea9d1a --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/queues.cpp @@ -0,0 +1,216 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include +#include "impl/wddm/device.h" +#include "impl/wddm/queue.h" +#include "impl/hsa/amd_hsa_signal.h" + +uint32_t get_vgpr_size_per_cu(HSA_ENGINE_ID id) { + uint32_t vgpr_size = 0x40000; + + uint32_t gfxv = HSA_GET_GFX_VERSION_FULL(id.ui32); + if( gfxv == 0x1100 || gfxv == 0x1101 || + gfxv == 0x1151 || + gfxv == 0x1200 || gfxv ==0x1201) { + vgpr_size = 0x60000; + } + + return vgpr_size; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) +{ + if (Type == HSA_QUEUE_SDMA_BY_ENG_ID) + return HSAKMT_STATUS_ERROR; + + return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, 0, + QueueAddress, QueueSizeInBytes, Event, + QueueResource); +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId, + HSA_QUEUE_TYPE Type, + HSAuint32 QueuePercentage, + HSA_QUEUE_PRIORITY Priority, + HSAuint32 SdmaEngineId, + void *QueueAddress, + HSAuint64 QueueSizeInBytes, + HsaEvent *Event, + HsaQueueResource *QueueResource) { + HSAKMT_STATUS result; + + CHECK_DXG_OPEN(); + assert(Event == nullptr); + + if (Priority < HSA_QUEUE_PRIORITY_MINIMUM || + Priority > HSA_QUEUE_PRIORITY_MAXIMUM) + return HSAKMT_STATUS_INVALID_PARAMETER; + + wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); + assert(device_); + + if (queue_acquire_buffer(QueueAddress) == false) + return HSAKMT_STATUS_INVALID_PARAMETER; + + switch (Type) { + case HSA_QUEUE_COMPUTE_AQL: { + assert(QueueResource->ErrorReason == nullptr); + uint64_t pkg_num = QueueSizeInBytes / 64; + uint32_t cmdbuf_size = device_->GetCmdbufSize(); + uint32_t queue_engine = device_->GetComputeEngine(); + bool use_hws = device_->IsHwsEnabled(queue_engine); + auto queue_ = new wsl::thunk::ComputeQueue( + device_, QueueAddress, pkg_num, + reinterpret_cast *>( + QueueResource->Queue_write_ptr_aql), + reinterpret_cast *>( 
+ QueueResource->Queue_read_ptr_aql), + QueueResource->ErrorReason, cmdbuf_size, queue_engine, use_hws); + + QueueResource->QueueId = reinterpret_cast(queue_); + // for doorbell_signal.hardware_doorbell_ptr + QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr(); + } break; + case HSA_QUEUE_SDMA: + case HSA_QUEUE_SDMA_BY_ENG_ID: { + pr_debug("create sdma queue in engine %d\n", SdmaEngineId); + uint32_t queue_engine = device_->GetSdmaEngine(0); // TODO: SdmaEngineId + bool use_hws = device_->IsHwsEnabled(queue_engine); + auto queue_ = new wsl::thunk::SDMAQueue( + device_, QueueAddress, QueueSizeInBytes, + queue_engine, use_hws); + QueueResource->QueueId = reinterpret_cast(queue_); + QueueResource->Queue_DoorBell_aql = queue_->GetDoorbellPtr(); + QueueResource->Queue_write_ptr_aql = queue_->GetRingWptr(); + QueueResource->Queue_read_ptr_aql = queue_->GetRingRptr(); + } break; + default: + assert(false); + QueueResource->QueueId = 0; + QueueResource->Queue_DoorBell = nullptr; + break; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue( + HSA_QUEUEID QueueId, HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority, + void *QueueAddress, HSAuint64 QueueSize, HsaEvent *Event) { + CHECK_DXG_OPEN(); + + if (Priority < HSA_QUEUE_PRIORITY_MINIMUM || + Priority > HSA_QUEUE_PRIORITY_MAXIMUM) + return HSAKMT_STATUS_INVALID_PARAMETER; + + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId) { + CHECK_DXG_OPEN(); + + auto queue_ = reinterpret_cast(QueueId); + void *QueueAddress = queue_->GetHsaQueueAddr(); + + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + delete queue_; + queue_release_buffer(QueueAddress); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId, + HSAuint32 CUMaskCount, + HSAuint32 *QueueCUMask) { + CHECK_DXG_OPEN(); 
+ + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (CUMaskCount == 0 || !QueueCUMask || ((CUMaskCount % 32) != 0)) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pr_warn_once("not implemented\n"); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetQueueInfo(HSA_QUEUEID QueueId, + HsaQueueInfo *QueueInfo) { + CHECK_DXG_OPEN(); + + if (QueueInfo == NULL) + return HSAKMT_STATUS_INVALID_PARAMETER; + memset(QueueInfo, 0, sizeof(*QueueInfo)); + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node, + void *TrapHandlerBaseAddress, + HSAuint64 TrapHandlerSizeInBytes, + void *TrapBufferBaseAddress, + HSAuint64 TrapBufferSizeInBytes) { + CHECK_DXG_OPEN(); + pr_warn_once("not implemented\n"); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtAllocQueueGWS(HSA_QUEUEID QueueId, HSAuint32 nGWS, + HSAuint32 *firstGWS) { + CHECK_DXG_OPEN(); + + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + assert(false); + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtQueueRingDoorbell(HSA_QUEUEID QueueId) { + CHECK_DXG_OPEN(); + + auto queue_ = reinterpret_cast(QueueId); + if (!queue_) + return HSAKMT_STATUS_INVALID_PARAMETER; + + queue_->RingDoorbell(); + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp new file mode 100644 index 0000000000..14b0faf1f8 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/spm.cpp @@ -0,0 +1,50 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer( + HSAuint32 PreferredNode, HSAuint32 SizeInBytes, HSAuint32 *timeout, + HSAuint32 *SizeCopied, void *DestMemoryAddress, bool *isSPMDataLoss) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode) { + CHECK_DXG_OPEN(); + // Used for profiling tools + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp new file mode 100644 index 0000000000..f2f8a10f68 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/svm.cpp @@ -0,0 +1,55 @@ +/* + * Copyright © 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* Helper functions for calling KFD SVM ioctl */ + +HSAKMT_STATUS HSAKMTAPI hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, + unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, + unsigned int nattr, + HSA_SVM_ATTRIBUTE *attrs) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtSetXNACKMode(HSAint32 enable) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + return HSAKMT_STATUS_NOT_SUPPORTED; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetXNACKMode(HSAint32 *enable) { + CHECK_DXG_OPEN(); + pr_warn_once("not supported\n"); + *enable = false; + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a new file mode 100644 index 0000000000..3b21eb936d Binary files /dev/null and b/projects/rocr-runtime/libhsakmt/src/dxg/thunk_proxy/libthunk_proxy.a differ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp new file mode 100644 index 0000000000..a28bb29215 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/time.cpp @@ -0,0 +1,49 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include "impl/wddm/device.h" + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, + HsaClockCounters *Counters) { + HSAKMT_STATUS result = HSAKMT_STATUS_SUCCESS; + + CHECK_DXG_OPEN(); + + std::memset(Counters, 0, sizeof(*Counters)); + + wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); + assert(device_); + device_->GetClockCounters(&Counters->GPUClockCounter, &Counters->CPUClockCounter); + + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0) + Counters->SystemClockCounter = ts.tv_sec * 1e9 + ts.tv_nsec; + Counters->SystemClockFrequencyHz = 1000000000; + + return result; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp new file mode 100644 index 0000000000..2db712e341 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/topology.cpp @@ -0,0 +1,1463 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "util/utils.h" + +/* Number of memory banks added by thunk on top of topology + * This only includes static heaps like LDS, scratch and SVM, + * not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported + * dynamically based on whether mmio aperture was mapped + * successfully on this node. + */ +#define NUM_OF_IGPU_HEAPS 3 +#define NUM_OF_DGPU_HEAPS 3 + +typedef struct { + HsaNodeProperties node; + std::vector mem; /* node->NumBanks elements */ + std::vector cache; + std::vector link; +} node_props_t; + +struct _topology_props { + HsaSystemProperties *g_system = nullptr; + std::vector g_props; + std::vector wdevices_; + uint32_t wdevice_num_ = 0; + uint32_t num_sysfs_nodes = 0; + int processor_vendor = -1; + double freq_max_ = 0.0; +}; + +static _topology_props* dxg_topology = new _topology_props(); + +/* Supported System Vendors */ +enum SUPPORTED_PROCESSOR_VENDORS { + GENUINE_INTEL = 0, + AUTHENTIC_AMD, + IBM_POWER +}; +/* Adding newline to make the search easier */ +static const char *supported_processor_vendor_name[] = { + "GenuineIntel", + "AuthenticAMD", + "" // POWER requires a different search method +}; + +static HSAKMT_STATUS topology_take_snapshot(void); +static void topology_drop_snapshot(void); + +/* information from /proc/cpuinfo */ +struct proc_cpuinfo { + uint32_t proc_num; /* processor */ + uint32_t apicid; /* apicid */ + char model_name[HSA_PUBLIC_NAME_SIZE]; /* model name */ +}; + +/* CPU cache table for all CPUs on the system. 
Each entry has the relative CPU + * info and caches connected to that CPU. + */ +typedef struct cpu_cacheinfo { + int32_t proc_num; /* this cpu's processor number */ + uint32_t num_caches; /* number of caches reported by this cpu */ +} cpu_cacheinfo_t; + +/* num_subdirs - find the number of sub-directories in the specified path + * @dirpath - directory path to find sub-directories underneath + * @prefix - only count sub-directory names starting with prefix. + * Use blank string, "", to count all. + * Return - number of sub-directories + */ +static int num_subdirs(char *dirpath, const char *prefix) { + int count = 0; + DIR *dirp; + struct dirent *dir; + int prefix_len = strlen(prefix); + + dirp = opendir(dirpath); + if (dirp) { + while ((dir = readdir(dirp)) != 0) { + if ((strcmp(dir->d_name, ".") == 0) || (strcmp(dir->d_name, "..") == 0)) + continue; + if (prefix_len && strncmp(dir->d_name, prefix, prefix_len)) + continue; + count++; + } + closedir(dirp); + } + + return count; +} + +/* fscanf_dec - read a file whose content is a decimal number + * @file [IN ] file to read + * @num [OUT] number in the file + */ +static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + if (fscanf(fd, "%u", num) != 1) { + pr_err("Failed to parse %s as a decimal.\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + fclose(fd); + return ret; +} + +/* fscanf_str - read a file whose content is a string + * @file [IN ] file to read + * @str [OUT] string in the file + */ +static HSAKMT_STATUS fscanf_str(char *file, char *str) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + if (fscanf(fd, "%s", str) != 1) { + pr_err("Failed to parse %s as a string.\n", file); + ret = 
HSAKMT_STATUS_ERROR; + } + + fclose(fd); + return ret; +} + +/* fscanf_size - read a file whose content represents size as a string + * @file [IN ] file to read + * @bytes [OUT] sizes in bytes + */ +static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes) { + FILE *fd; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + char unit; + int n; + + fd = fopen(file, "r"); + if (!fd) { + pr_err("Failed to open %s\n", file); + return HSAKMT_STATUS_INVALID_PARAMETER; + } + + n = fscanf(fd, "%u%c", bytes, &unit); + if (n < 1) { + pr_err("Failed to parse %s\n", file); + ret = HSAKMT_STATUS_ERROR; + } + + if (n == 2) { + switch (unit) { + case 'K': + *bytes <<= 10; + break; + case 'M': + *bytes <<= 20; + break; + case 'G': + *bytes <<= 30; + break; + default: + ret = HSAKMT_STATUS_ERROR; + break; + } + } + + fclose(fd); + return ret; +} + +/* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into + * SiblingMap in cache + * @shared_cpu_map [IN ] shared_cpu_map string + * @cpuinfo [IN ] cpuinfo to get apicid + * @this_cache [OUT] CPU cache to fill in SiblingMap + */ +static void cpumap_to_cpu_ci(char *shared_cpu_map, + const std::vector& cpuinfo, + HsaCacheProperties *this_cache) { + int num_hexs, bit; + uint32_t proc, apicid, mask; + char *ch_ptr; + + /* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x + * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually + * procs), it's presented in X1. The next 32 is in X2, and so on. 
+ */ + num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */ + ch_ptr = strtok(shared_cpu_map, ","); + while (num_hexs-- > 0) { + mask = strtol(ch_ptr, NULL, 16); /* each X */ + for (bit = 0; bit < 32; bit++) { + if (!((1 << bit) & mask)) + continue; + proc = num_hexs * 32 + bit; + apicid = cpuinfo[proc].apicid; + if (apicid >= HSA_CPU_SIBLINGS) { + pr_warn("SiblingMap buffer %d is too small\n", HSA_CPU_SIBLINGS); + continue; + } + this_cache->SiblingMap[apicid] = 1; + } + ch_ptr = strtok(NULL, ","); + } +} + +/* get_cpu_cache_info - get specified CPU's cache information from sysfs + * @prefix [IN] sysfs path for target cpu cache, + * /sys/devices/system/node/nodeX/cpuY/cache + * @cpuinfo [IN] /proc/cpuinfo data to get apicid + * @cpu_ci: CPU specified. This parameter is an input and also an output. + * [IN] cpu_ci->num_caches: number of index dirs + * [OUT] cpu_ci->cache_info: to store cache info collected + * [OUT] cpu_ci->num_caches: reduces when shared with other cpu(s) + * Return: number of cache reported from this cpu + */ +static int get_cpu_cache_info(const char *prefix, + const std::vector& cpuinfo, + std::vector& cache, + cpu_cacheinfo_t& cpu_ci) { + int n; + char path[256], str[256]; + bool is_power9 = false; + + if (dxg_topology->processor_vendor == IBM_POWER) { + if (strcmp(cpuinfo[0].model_name, "POWER9") == 0) { + is_power9 = true; + } + } + + HsaCacheProperties this_cache; + int num_idx = cpu_ci.num_caches; + for (int idx = 0; idx < num_idx; idx++) { + memset(&this_cache, 0, sizeof(this_cache)); + /* If this cache is shared by multiple CPUs, we only need + * to list it in the first CPU. + */ + if (is_power9) { + // POWER9 has SMT4 + if (cpu_ci.proc_num & 0x3) { + /* proc is not 0,4,8,etc. Skip and reduce the cache count. */ + --cpu_ci.num_caches; + continue; + } + } else { + snprintf(path, 256, "%s/index%d/shared_cpu_list", prefix, idx); + /* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4... 
+ * For both cases, this cache is listed to proc n1 only. + */ + fscanf_dec(path, (uint32_t *)&n); + if (cpu_ci.proc_num != n) { + /* proc is not n1. Skip and reduce the cache count. */ + --cpu_ci.num_caches; + continue; + } + this_cache.ProcessorIdLow = cpuinfo[cpu_ci.proc_num].apicid; + } + + /* CacheLevel */ + snprintf(path, 256, "%s/index%d/level", prefix, idx); + fscanf_dec(path, &this_cache.CacheLevel); + /* CacheType */ + snprintf(path, 256, "%s/index%d/type", prefix, idx); + + memset(str, 0, sizeof(str)); + fscanf_str(path, str); + if (!strcmp(str, "Data")) + this_cache.CacheType.ui32.Data = 1; + if (!strcmp(str, "Instruction")) + this_cache.CacheType.ui32.Instruction = 1; + if (!strcmp(str, "Unified")) { + this_cache.CacheType.ui32.Data = 1; + this_cache.CacheType.ui32.Instruction = 1; + } + this_cache.CacheType.ui32.CPU = 1; + /* CacheSize */ + snprintf(path, 256, "%s/index%d/size", prefix, idx); + fscanf_size(path, &this_cache.CacheSize); + /* CacheLineSize */ + snprintf(path, 256, "%s/index%d/coherency_line_size", prefix, idx); + fscanf_dec(path, &this_cache.CacheLineSize); + /* CacheAssociativity */ + snprintf(path, 256, "%s/index%d/ways_of_associativity", prefix, idx); + fscanf_dec(path, &this_cache.CacheAssociativity); + /* CacheLinesPerTag */ + snprintf(path, 256, "%s/index%d/physical_line_partition", prefix, idx); + fscanf_dec(path, &this_cache.CacheLinesPerTag); + /* CacheSiblings */ + snprintf(path, 256, "%s/index%d/shared_cpu_map", prefix, idx); + fscanf_str(path, str); + cpumap_to_cpu_ci(str, cpuinfo, &this_cache); + + cache.push_back(this_cache); + } + + return cpu_ci.num_caches; +} + +static HSAKMT_STATUS topology_map_node_id(uint32_t node_id, + wsl::thunk::WDDMDevice *&device) { + uint32_t idx = node_id; + if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes)) { + device = nullptr; + return HSAKMT_STATUS_ERROR; + } + + device = dxg_topology->wdevices_[node_id - 1]; + return HSAKMT_STATUS_SUCCESS; 
+} + +HSAKMT_STATUS topology_sysfs_get_system_props(HsaSystemProperties& props) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + bool is_node_supported = true; + uint32_t num_supported_nodes = 0; + + std::memset(&props, 0, sizeof(props)); + + dxg_runtime->HeapFini(); + for (auto device : dxg_topology->wdevices_) + delete device; + dxg_topology->wdevices_.clear(); + + WDDMCreateDevices(dxg_topology->wdevices_); + int num_adapters = dxg_topology->wdevices_.size(); + if (num_adapters == 0) { + pr_err("No WDDM adapters found.\n"); + return HSAKMT_STATUS_ERROR; + } + + dxg_topology->num_sysfs_nodes = num_adapters + 1; + dxg_runtime->HeapInit(); + props.NumNodes = dxg_topology->num_sysfs_nodes; + if (dxg_runtime->default_node > num_adapters) + dxg_runtime->default_node = num_adapters; + + return ret; +} + +void topology_setup_is_dgpu_param(HsaNodeProperties *props) { + /* if we found a dGPU node, then treat the whole system as dGPU */ + /* noted that some APUs are also treated as dGPU in runtime */ + if (!props->NumCPUCores && props->NumFComputeCores) + dxg_runtime->hsakmt_is_dgpu = true; +} + +static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties& props, + const std::vector& cpuinfo) { + for (int i = 0; i < cpuinfo.size(); i++) { + if (props.CComputeIdLo == cpuinfo[i].apicid) { + if (!props.DeviceId) /* CPU-only node */ + strncpy((char *)props.AMDName, cpuinfo[i].model_name, + sizeof(props.AMDName)); + /* Convert from UTF8 to UTF16 */ + int j; + for (j = 0; + cpuinfo[i].model_name[j] != '\0' && j < HSA_PUBLIC_NAME_SIZE - 1; j++) + props.MarketingName[j] = cpuinfo[i].model_name[j]; + props.MarketingName[j] = '\0'; + return HSAKMT_STATUS_SUCCESS; + } + } + + return HSAKMT_STATUS_ERROR; +} + +static int topology_search_processor_vendor(const std::string& processor_name) { + for (unsigned int i = 0; i < ARRAY_LEN(supported_processor_vendor_name); i++) { + if (processor_name == supported_processor_vendor_name[i]) + return i; + if (processor_name == "POWER9, 
altivec supported") + return IBM_POWER; + } + return -1; +} + +/* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required + * topology information + * cpuinfo [OUT]: output buffer to hold cpu information + * num_procs: number of processors the output buffer can hold + */ +static HSAKMT_STATUS topology_parse_cpuinfo(std::vector& cpuinfo) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + uint32_t num_procs = cpuinfo.size(); + + std::ifstream cpuinfo_max_freq( + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"); + if (cpuinfo_max_freq) { + std::string line; + std::getline(cpuinfo_max_freq, line); + dxg_topology->freq_max_ = static_cast(std::stod(line) / 1000); + } + + std::ifstream cpuinfo_file("/proc/cpuinfo"); + if (!cpuinfo_file) { + pr_err("Failed to open /proc/cpuinfo. Unable to get CPU information"); + return HSAKMT_STATUS_ERROR; + } + + std::string line; + uint32_t proc = 0; + while (std::getline(cpuinfo_file, line)) { + if (line.substr(0, 9) == "processor") { + proc = std::stoi(line.substr(line.find(':') + 2)); + if (proc >= num_procs) { + pr_err("cpuinfo contains processor %d larger than %u\n", proc, num_procs); + return HSAKMT_STATUS_NO_MEMORY; + } + continue; + } + + if (line.substr(0, 9) == "vendor_id" && dxg_topology->processor_vendor == -1) { + std::string vendor = line.substr(line.find(':') + 2); + dxg_topology->processor_vendor = topology_search_processor_vendor(vendor.c_str()); + continue; + } + + if (line.substr(0, 10) == "model name") { + std::string model_name = line.substr(line.find(':') + 2); + if (model_name.size() > HSA_PUBLIC_NAME_SIZE) + model_name.resize(HSA_PUBLIC_NAME_SIZE); + std::strncpy(cpuinfo[proc].model_name, model_name.c_str(), HSA_PUBLIC_NAME_SIZE); + continue; + } + + if (line.substr(0, 6) == "apicid") { + cpuinfo[proc].apicid = std::stoi(line.substr(line.find(':') + 2)); + continue; + } + + if (!cpuinfo_max_freq) { + if (line.substr(0, 7) == "cpu MHz") { + double freq = std::stod(line.substr(line.find(':') + 2)); 
+ if (freq > dxg_topology->freq_max_) { + dxg_topology->freq_max_ = freq; + } + continue; + } + } + } + + if (dxg_topology->processor_vendor < 0) { + pr_err("Failed to get Processor Vendor. Setting to %s", supported_processor_vendor_name[GENUINE_INTEL]); + dxg_topology->processor_vendor = GENUINE_INTEL; + } + + return ret; +} + +static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id, + HsaNodeProperties& props, + bool& p2p_links, + uint32_t& num_p2pLinks) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + memset(&props, 0, sizeof(props)); + p2p_links = false; + num_p2pLinks = 0; + + props.MaxEngineClockMhzCCompute = dxg_topology->freq_max_; + + if (node_id == 0) { + /* CPU node */ + props.NumCPUCores = sysconf(_SC_NPROCESSORS_ONLN); + props.NumMemoryBanks = 1; + props.KFDGpuID = 0; + return HSAKMT_STATUS_SUCCESS; + } + + /* gpu node */ + wsl::thunk::WDDMDevice *device; + ret = topology_map_node_id(node_id, device); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + props.NumCPUCores = 0; + props.NumFComputeCores = device->SimdPerCu() * device->ComputeUnitCount(); + props.NumMemoryBanks = 1; + props.NumCaches = 3; + props.NumIOLinks = 1; + props.CComputeIdLo = 0; + props.FComputeIdLo = 0; + props.Capability.ui32.ASICRevision = device->AsicRevision(); + props.Capability.ui32.WatchPointsTotalBits = + std::log2(device->WatchPointsNum()); + props.MaxWavesPerSIMD = device->WavePerCu() / device->SimdPerCu(); + props.LDSSizeInKB = device->LdsSize() / 1024; + props.GDSSizeInKB = 0; + props.WaveFrontSize = device->WavefrontSize(); + props.NumShaderBanks = device->NumShaderEngine(); + props.NumArrays = device->ShaderArrayPerShaderEngine(); + props.NumCUPerArray = device->ComputeUnitCount() / props.NumArrays; + props.NumSIMDPerCU = device->SimdPerCu(); + props.MaxSlotsScratchCU = device->MaxScratchSlotsPerCu(); + props.VendorId = 0x1002; + props.DeviceId = device->DeviceId(); + props.LocationId = device->PciBusAddr(); + props.LocalMemSize = 0; + 
props.MaxEngineClockMhzFCompute = device->MaxEngineClockMhz(); + props.DrmRenderMinor = node_id; + + { + int i; + const char *name = device->ProductName(); + for (i = 0; name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1; i++) + props.MarketingName[i] = name[i]; + props.MarketingName[i] = '\0'; + } + props.uCodeEngineVersions.uCodeSDMA = device->GetSdmaFwVersion(); + props.DebugProperties.Value = 0; + props.HiveID = 0; + props.NumSdmaEngines = device->NumSdmaEngine(); + props.NumSdmaXgmiEngines = 0; + props.NumSdmaQueuesPerEngine = 6; // TODO + props.NumCpQueues = device->GetNumCpQueues(); + props.NumGws = 0; + /* + * In Native Linux, if the asic is APU, this value will be set to 1, + * if the asic is dGPU, this value will be set to 0. clr use this info + * to set hostUnifiedMemory_, but for now wsl does not support this feature. + * Therefore, fore vaule to 0 temporarily. + */ + props.Integrated = 0; + props.Domain = device->Domain(); + props.UniqueID = device->Uuid(); + props.NumXcc = 1; + props.KFDGpuID = device->DeviceId(); // TODO + props.FamilyID = device->GfxFamily(); + + props.EngineId.ui32.uCode = device->GetMecFwVersion(); + char *envvar = getenv("HSA_OVERRIDE_GFX_VERSION"); + if (envvar) { + char dummy = '\0'; + uint32_t major = 0, minor = 0, step = 0; + /* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping */ + if ((sscanf(envvar, "%u.%u.%u%c", &major, &minor, &step, &dummy) != 3) || + (major > 63 || minor > 255 || step > 255)) { + pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n", envvar); + return HSAKMT_STATUS_ERROR; + } + props.OverrideEngineId.ui32.Major = major & 0x3f; + props.OverrideEngineId.ui32.Minor = minor & 0xff; + props.OverrideEngineId.ui32.Stepping = step & 0xff; + } + props.EngineId.ui32.Major = device->Major(); + props.EngineId.ui32.Minor = device->Minor(); + props.EngineId.ui32.Stepping = device->Stepping(); + + snprintf((char *)props.AMDName, sizeof(props.AMDName) - 1, "GFX%06x", + HSA_GET_GFX_VERSION_FULL(props.EngineId.ui32)); + + if 
(!dxg_runtime->is_svm_api_supported) + props.Capability.ui32.SVMAPISupported = 0; + props.Capability.ui32.DoorbellType = 2; + + /* Get VGPR/SGPR size in byte per CU */ + props.SGPRSizePerCU = SGPR_SIZE_PER_CU; + props.VGPRSizePerCU = get_vgpr_size_per_cu(props.EngineId); + + if (props.NumFComputeCores) + assert(props.EngineId.ui32.Major && + "HSA_OVERRIDE_GFX_VERSION may be needed"); + + return ret; +} + +static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id, + uint32_t mem_id, + HsaMemoryProperties& props) { + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + + std::memset(&props, 0, sizeof(props)); + if (node_id == 0) { + /* CPU node */ + props.HeapType = HSA_HEAPTYPE_SYSTEM; + + struct sysinfo info; + sysinfo(&info); + props.SizeInBytes = info.totalram; + + /* props.SizeInBytes is the actual physical system + * memory size. Reserve 1/16th for WSL system usage. + */ + dxg_runtime->max_single_alloc_size = info.totalram - (info.totalram >> 4); + + props.Flags.MemoryProperty = 0; + /* TODO: sudo dmidecode --type memory doesn't work on wsl */ + props.Width = 64; + props.MemoryClockMax = 2133; + return HSAKMT_STATUS_SUCCESS; + } + + wsl::thunk::WDDMDevice *device; + ret = topology_map_node_id(node_id, device); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + props.HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE; + + if (device->IsDgpu()) + props.SizeInBytes = device->LocalHeapSize(); + else + props.SizeInBytes = device->NonLocalHeapSize(); + + props.Width = device->MemoryBusWidth(); + props.MemoryClockMax = device->MaxMemoryClockMhz(); + + return ret; +} + +/* topology_get_cpu_cache_props - Read CPU cache information from sysfs + * @node [IN] CPU node number + * @cpuinfo [IN] /proc/cpuinfo data + * @tbl [OUT] the node table to fill up + * Return: HSAKMT_STATUS_SUCCESS in success or error number in failure + */ +static HSAKMT_STATUS topology_get_cpu_cache_props(int node, + const std::vector& cpuinfo, + node_props_t& tbl) { + HSAKMT_STATUS ret = 
HSAKMT_STATUS_SUCCESS; + + /* Get max path size from /sys/devices/system/node/node%d/%s/cache + * below, which will max out according to the largest filename, + * which can be present twice in the string above. 29 is for the prefix + * and the +6 is for the cache suffix + */ +#ifndef MAXNAMLEN +/* MAXNAMLEN is the BSD name for NAME_MAX. glibc aliases this as NAME_MAX, but + * not musl */ +#define MAXNAMLEN NAME_MAX +#endif + constexpr uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6); + char path[MAXPATHSIZE], node_dir[MAXPATHSIZE]; + int max_cpus; + int cache_cnt = 0; + DIR *dirp = NULL; + struct dirent *dir; + char *p; + + /* Get info from /sys/devices/system/node/nodeX/cpuY/cache */ + int node_real = node; + if (dxg_topology->processor_vendor == IBM_POWER) { + if (!strcmp(cpuinfo[0].model_name, "POWER9")) { + node_real = node * 8; + } + } + snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real); + /* Other than cpuY folders, this dir also has cpulist and cpumap */ + max_cpus = num_subdirs(node_dir, "cpu"); + if (max_cpus <= 0) { + /* If CONFIG_NUMA is not enabled in the kernel, + * /sys/devices/system/node doesn't exist. 
+ */ + if (node) { /* CPU node must be 0 or something is wrong */ + pr_err("Fail to get cpu* dirs under %s.", node_dir); + ret = HSAKMT_STATUS_ERROR; + goto exit; + } + /* Fall back to use /sys/devices/system/cpu */ + snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu"); + max_cpus = num_subdirs(node_dir, "cpu"); + if (max_cpus <= 0) { + pr_err("Fail to get cpu* dirs under %s\n", node_dir); + ret = HSAKMT_STATUS_ERROR; + goto exit; + } + } + + dirp = opendir(node_dir); + while ((dir = readdir(dirp)) != 0) { + if (strncmp(dir->d_name, "cpu", 3)) + continue; + if (!isdigit(dir->d_name[3])) /* ignore files like cpulist */ + continue; + if (strlen(node_dir) + strlen(dir->d_name) + strlen("/cache") + 2 < MAXPATHSIZE) { + std::string path_str = std::string(node_dir) + "/" + dir->d_name + "/cache"; + strncpy(path, path_str.c_str(), MAXPATHSIZE); + path[MAXPATHSIZE - 1] = '\0'; + } else { + pr_err("Path is too long and was truncated.\n"); + goto exit; + } + + cpu_cacheinfo_t cpu_ci; + cpu_ci.num_caches = num_subdirs(path, "index"); + cpu_ci.proc_num= atoi(dir->d_name+3); + + cache_cnt += get_cpu_cache_info(path, cpuinfo, tbl.cache, cpu_ci); + } + assert(cache_cnt == tbl.cache.size()); + tbl.node.NumCaches = cache_cnt; + +exit: + if (dirp) + closedir(dirp); + return ret; +} + +/* For a give Node @node_id the function gets @iolink_id information i.e. parses + * sysfs the following sysfs entry + * ./nodes/@node_id/io_links/@iolink_id/properties. @node_id has to be valid + * accessible node. + * + * If node_to specified by the @iolink_id is not accessible the function returns + * HSAKMT_STATUS_NOT_SUPPORTED. If node_to is accessible, then node_to is mapped + * from sysfs_node to user_node and returns HSAKMT_STATUS_SUCCESS. 
+ */ +static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id, + uint32_t iolink_id, + HsaIoLinkProperties& props, + bool p2pLink) { + wsl::thunk::WDDMDevice *device; + topology_map_node_id(node_id, device); + + std::memset(&props, 0, sizeof(props)); + props.IoLinkType = HSA_IOLINKTYPE_PCIEXPRESS; + props.VersionMajor = props.VersionMinor = 0; + props.NodeFrom = node_id; + props.NodeTo = 0; + props.Weight = 20; + props.Flags.ui32.Override = 1; + props.Flags.ui32.NonCoherent = 1; + props.Flags.ui32.NoAtomics32bit = !(device->SupportPlatformAtomic()); + props.Flags.ui32.NoAtomics64bit = !(device->SupportPlatformAtomic()); + props.RecSdmaEngIdMask = 0; + + return HSAKMT_STATUS_SUCCESS; +} + +/* topology_get_free_io_link_slot_for_node - For the given node_id, find the + * next available free slot to add an io_link + */ +static HsaIoLinkProperties * +topology_get_free_io_link_slot_for_node(uint32_t node_id, + const HsaSystemProperties& sys_props, + std::vector& node_props) { + std::vector& props = node_props[node_id].link; + + if (node_id >= sys_props.NumNodes) { + pr_err("Invalid node [%d]\n", node_id); + return NULL; + } + + if (!props.size()) { + pr_err("No io_link reported for Node [%d]\n", node_id); + return NULL; + } + + if (node_props[node_id].node.NumIOLinks >= sys_props.NumNodes - 1) { + pr_err("No more space for io_link for Node [%d]\n", node_id); + return NULL; + } + + return &props[node_props[node_id].node.NumIOLinks]; +} + +/* topology_add_io_link_for_node - If a free slot is available, + * add io_link for the given Node. 
+ * TODO: Add other members of HsaIoLinkProperties + */ +static HSAKMT_STATUS topology_add_io_link_for_node( + uint32_t node_from, const HsaSystemProperties& sys_props, + std::vector& node_props, HSA_IOLINKTYPE IoLinkType, uint32_t node_to, + uint32_t Weight) { + HsaIoLinkProperties *props; + + props = + topology_get_free_io_link_slot_for_node(node_from, sys_props, node_props); + if (!props) + return HSAKMT_STATUS_NO_MEMORY; + + props->IoLinkType = IoLinkType; + props->NodeFrom = node_from; + props->NodeTo = node_to; + props->Weight = Weight; + node_props[node_from].node.NumIOLinks++; + + return HSAKMT_STATUS_SUCCESS; +} + +/* Find the CPU that this GPU (gpu_node) directly connects to */ +static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node, + const std::vector& node_props) { + const std::vector& props = node_props[gpu_node].link; + uint32_t i; + + if (!node_props[gpu_node].node.KFDGpuID || props.empty() || + node_props[gpu_node].node.NumIOLinks == 0) + return -1; + + for (i = 0; i < node_props[gpu_node].node.NumIOLinks; i++) + if (props[i].IoLinkType == HSA_IOLINKTYPE_PCIEXPRESS && + props[i].Weight <= 20) /* >20 is GPU->CPU->GPU */ + return props[i].NodeTo; + + return -1; +} + +/* Get node1->node2 IO link information. This should be a direct link that has + * been created in the kernel. 
+ */ +static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2, + const std::vector& node_props, + HSAuint32 *weight, + HSA_IOLINKTYPE *type) { + const std::vector& props = node_props[node1].link; + uint32_t i; + + if (!props.size()) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + for (i = 0; i < node_props[node1].node.NumIOLinks; i++) + if (props[i].NodeTo == node2) { + if (weight) + *weight = props[i].Weight; + if (type) + *type = props[i].IoLinkType; + return HSAKMT_STATUS_SUCCESS; + } + + return HSAKMT_STATUS_INVALID_PARAMETER; +} + +static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2, + const std::vector& node_props, + HSAuint32 *weight, + HSA_IOLINKTYPE *type) { + int32_t dir_cpu1 = -1, dir_cpu2 = -1; + HSAKMT_STATUS ret; + uint32_t i; + + *weight = 0; + *type = HSA_IOLINKTYPE_UNDEFINED; + + if (node1 == node2) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* CPU->CPU is not an indirect link */ + if (!node_props[node1].node.KFDGpuID && !node_props[node2].node.KFDGpuID) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + + if (node_props[node1].node.HiveID && node_props[node2].node.HiveID && + node_props[node1].node.HiveID == node_props[node2].node.HiveID) + return HSAKMT_STATUS_INVALID_PARAMETER; + + if (node_props[node1].node.KFDGpuID) + dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props); + if (node_props[node2].node.KFDGpuID) + dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props); + + if (dir_cpu1 < 0 && dir_cpu2 < 0) + return HSAKMT_STATUS_ERROR; + + /* if the node2(dst) is GPU , it need to be large bar for host access*/ + if (node_props[node2].node.KFDGpuID) { + for (i = 0; i < node_props[node2].node.NumMemoryBanks; ++i) + if (node_props[node2].mem[i].HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC) + break; + if (i >= node_props[node2].node.NumMemoryBanks) + return HSAKMT_STATUS_ERROR; + } + /* Possible topology: + * GPU --(weight1) -- CPU -- (weight2) -- GPU + * GPU --(weight1) -- CPU -- (weight2) -- CPU -- (weight3) -- 
GPU + * GPU --(weight1) -- CPU -- (weight2) -- CPU + * CPU -- (weight2) -- CPU -- (weight3) -- GPU + */ + HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0; + if (dir_cpu1 >= 0) { /* GPU->CPU ... */ + if (dir_cpu2 >= 0) { + if (dir_cpu1 == dir_cpu2) /* GPU->CPU->GPU*/ { + ret = + get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = + get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); + } else /* GPU->CPU->CPU->GPU*/ { + ret = + get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu1, dir_cpu2, node_props, &weight2, + type); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + /* On QPI interconnection, GPUs can't access + * each other if they are attached to different + * CPU sockets. CPU<->CPU weight larger than 20 + * means the two CPUs are in different sockets. + */ + if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20) + return HSAKMT_STATUS_NOT_SUPPORTED; + ret = + get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); + } + } else /* GPU->CPU->CPU */ { + ret = get_direct_iolink_info(node1, dir_cpu1, node_props, &weight1, NULL); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu1, node2, node_props, &weight2, type); + } + } else { /* CPU->CPU->GPU */ + ret = get_direct_iolink_info(node1, dir_cpu2, node_props, &weight2, type); + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + ret = get_direct_iolink_info(dir_cpu2, node2, node_props, &weight3, NULL); + } + + if (ret != HSAKMT_STATUS_SUCCESS) + return ret; + + *weight = weight1 + weight2 + weight3; + return HSAKMT_STATUS_SUCCESS; +} + +static void +topology_create_indirect_gpu_links(const HsaSystemProperties& sys_props, + std::vector& node_props) { + + uint32_t i, j; + HSAuint32 weight; + HSA_IOLINKTYPE type; + + for (i = 0; i < sys_props.NumNodes - 1; i++) { + for (j = 
i + 1; j < sys_props.NumNodes; j++) { + get_indirect_iolink_info(i, j, node_props, &weight, &type); + if (!weight) + goto try_alt_dir; + if (topology_add_io_link_for_node(i, sys_props, node_props, type, j, + weight) != HSAKMT_STATUS_SUCCESS) + pr_err("Fail to add IO link %d->%d\n", i, j); + try_alt_dir: + get_indirect_iolink_info(j, i, node_props, &weight, &type); + if (!weight) + continue; + if (topology_add_io_link_for_node(j, sys_props, node_props, type, i, + weight) != HSAKMT_STATUS_SUCCESS) + pr_err("Fail to add IO link %d->%d\n", j, i); + } + } +} + +HSAKMT_STATUS topology_take_snapshot(void) { + uint32_t i, mem_id, cache_id; + HsaSystemProperties sys_props; + std::vector& temp_props = dxg_topology->g_props; + HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS; + const uint32_t num_procs = sysconf(_SC_NPROCESSORS_ONLN); + std::vector cpuinfo(num_procs); + uint32_t num_ioLinks; + bool p2p_links = false; + uint32_t num_p2pLinks = 0; + + topology_parse_cpuinfo(cpuinfo); + + ret = topology_sysfs_get_system_props(sys_props); + if (ret != HSAKMT_STATUS_SUCCESS) + goto err; + if (sys_props.NumNodes > 0) { + temp_props.resize(sys_props.NumNodes); + + for (i = 0; i < sys_props.NumNodes; i++) { + wsl::thunk::WDDMDevice *device_; + topology_map_node_id(i, device_); + + ret = topology_sysfs_get_node_props(i, temp_props[i].node, p2p_links, + num_p2pLinks); + if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + + topology_setup_is_dgpu_param(&temp_props[i].node); + + if (temp_props[i].node.NumCPUCores) + topology_get_cpu_model_name(temp_props[i].node, cpuinfo); + + if (temp_props[i].node.NumMemoryBanks) { + temp_props[i].mem.resize(temp_props[i].node.NumMemoryBanks); + + for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) { + ret = topology_sysfs_get_mem_props(i, mem_id, + temp_props[i].mem[mem_id]); + if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + } + } + + if (temp_props[i].node.NumCaches) { + temp_props[i].cache.resize(temp_props[i].node.NumCaches); + 
for (int j = 0; j < 3; j++) { + temp_props[i].cache[j].CacheType.ui32.Data = 1; + temp_props[i].cache[j].CacheType.ui32.HSACU = 1; + temp_props[i].cache[j].CacheLevel = j + 1; + } + temp_props[i].cache[0].CacheSize = device_->GetL1CacheSize() / 1024; + temp_props[i].cache[1].CacheSize = device_->GetL2CacheSize() / 1024; + temp_props[i].cache[2].CacheSize = device_->GetL3CacheSize() / 1024; + } else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */ + ret = topology_get_cpu_cache_props(i, cpuinfo, temp_props[i]); + if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + } + + /* To simplify, allocate maximum needed memory for io_links for each node. + * This removes the need for realloc when indirect and QPI links are added + * later + */ + temp_props[i].link.resize(sys_props.NumNodes - 1); + num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks; + uint32_t link_id = 0; + + if (num_ioLinks) { + uint32_t sys_link_id = 0; + + /* Parse all the sysfs specified io links. Skip the ones where the + * remote node (node_to) is not accessible + */ + while (sys_link_id < num_ioLinks && link_id < sys_props.NumNodes - 1) { + ret = topology_sysfs_get_iolink_props( + i, sys_link_id++, temp_props[i].link[link_id], false); + if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { + ret = HSAKMT_STATUS_SUCCESS; + continue; + } else if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + link_id++; + } + /* sysfs specifies all the io links. Limit the number to valid ones */ + temp_props[i].node.NumIOLinks = link_id; + } + + if (num_p2pLinks) { + uint32_t sys_link_id = 0; + + /* Parse all the sysfs specified p2p links. 
+ */ + while (sys_link_id < num_p2pLinks && link_id < sys_props.NumNodes - 1) { + ret = topology_sysfs_get_iolink_props( + i, sys_link_id++, temp_props[i].link[link_id], true); + if (ret == HSAKMT_STATUS_NOT_SUPPORTED) { + ret = HSAKMT_STATUS_SUCCESS; + continue; + } else if (ret != HSAKMT_STATUS_SUCCESS) { + goto err; + } + link_id++; + } + temp_props[i].node.NumIOLinks = link_id; + } + } + } + + if (!p2p_links) { + /* All direct IO links are created in the kernel. Here we need to + * connect GPU<->GPU or GPU<->CPU indirect IO links. + */ + topology_create_indirect_gpu_links(sys_props, temp_props); + } + + if (!dxg_topology->g_system) { + dxg_topology->g_system = (HsaSystemProperties *)malloc(sizeof(HsaSystemProperties)); + if (!dxg_topology->g_system) { + ret = HSAKMT_STATUS_NO_MEMORY; + goto err; + } + } + + *dxg_topology->g_system = sys_props; +err: + return ret; +} + +/* Drop the Snashot of the HSA topology information. Assume lock is held. */ +void topology_drop_snapshot(void) { + if (!!dxg_topology->g_system != !!dxg_topology->g_props.size()) + pr_warn("Probably inconsistency?\n"); + + dxg_topology->g_props.clear(); + + free(dxg_topology->g_system); + dxg_topology->g_system = NULL; + + trim_suballocator(); + for (auto device : dxg_topology->wdevices_) + delete device; + dxg_topology->wdevices_.clear(); +} + +HSAKMT_STATUS validate_nodeid(uint32_t nodeid, uint32_t *gpu_id) { + if (dxg_topology->g_props.empty() || !dxg_topology->g_system || dxg_topology->g_system->NumNodes <= nodeid) + return HSAKMT_STATUS_INVALID_NODE_UNIT; + if (gpu_id) + *gpu_id = dxg_topology->g_props[nodeid].node.KFDGpuID; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id) { + uint64_t node_idx; + + for (node_idx = 0; node_idx < dxg_topology->g_system->NumNodes; node_idx++) { + if (dxg_topology->g_props[node_idx].node.KFDGpuID == gpu_id) { + *node_id = node_idx; + return HSAKMT_STATUS_SUCCESS; + } + } + + return 
HSAKMT_STATUS_INVALID_NODE_UNIT; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties) { + HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; + + CHECK_DXG_OPEN(); + + if (!SystemProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* We already have a valid snapshot. Avoid double initialization that + * would leak memory. + */ + if (dxg_topology->g_system) { + *SystemProperties = *dxg_topology->g_system; + goto out; + } + + err = topology_take_snapshot(); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + assert(dxg_topology->g_system); + + // err = fmm_init_process_apertures(dxg_topology->g_system->NumNodes); + if (err != HSAKMT_STATUS_SUCCESS) + goto init_process_apertures_failed; + + // err = init_process_doorbells(dxg_topology->g_system->NumNodes); + if (err != HSAKMT_STATUS_SUCCESS) + goto init_doorbells_failed; + + *SystemProperties = *dxg_topology->g_system; + + goto out; + +init_doorbells_failed: + // fmm_destroy_process_apertures(); +init_process_apertures_failed: + topology_drop_snapshot(); + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void) { + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + topology_drop_snapshot(); + + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS topology_get_node_props(HSAuint32 NodeId, + HsaNodeProperties *NodeProperties) { + if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes) + return HSAKMT_STATUS_ERROR; + + *NodeProperties = dxg_topology->g_props[NodeId].node; + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeProperties(HSAuint32 NodeId, HsaNodeProperties *NodeProperties) { + HSAKMT_STATUS err; + uint32_t gpu_id; + + if (!NodeProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + 
pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + err = validate_nodeid(NodeId, &gpu_id); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + + err = topology_get_node_props(NodeId, NodeProperties); + if (err != HSAKMT_STATUS_SUCCESS) + goto out; + /* For CPU only node don't add any additional GPU memory banks. */ + if (gpu_id) { + uint64_t base, limit; + if (!(NodeProperties->Integrated)) + NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS; + else + NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS; + // TODO: for apu + /*if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base, + &limit) == HSAKMT_STATUS_SUCCESS) + NodeProperties->NumMemoryBanks += 1;*/ + } + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks, + HsaMemoryProperties *MemoryProperties) { + HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS; + uint32_t i; + + if (!MemoryProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties)); + for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumMemoryBanks, NumBanks); i++) { + assert(dxg_topology->g_props[NodeId].mem.size()); + MemoryProperties[i] = dxg_topology->g_props[NodeId].mem[i]; + } + + /* The following memory banks does not apply to CPU only node */ + wsl::thunk::WDDMDevice *device_ = get_wddmdev(NodeId); + if (device_ == nullptr) + goto out; + + /*Add LDS*/ + if (i < NumBanks) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_LDS; + MemoryProperties[i].VirtualBaseAddress = device_->SharedApertureBase(); + MemoryProperties[i].SizeInBytes = dxg_topology->g_props[NodeId].node.LDSSizeInKB * 1024; + i++; + } + + /* Add SCRATCH */ + if (i < NumBanks) { + MemoryProperties[i].HeapType = HSA_HEAPTYPE_GPU_SCRATCH; + MemoryProperties[i].VirtualBaseAddress = device_->PrivateApertureBase(); + 
MemoryProperties[i].SizeInBytes = device_->PrivateApertureSize(); + i++; + } + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties( + HSAuint32 NodeId, HSAuint32 ProcessorId, HSAuint32 NumCaches, + HsaCacheProperties *CacheProperties) { + HSAKMT_STATUS err; + uint32_t i; + + if (!CacheProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) { + err = HSAKMT_STATUS_INVALID_NODE_UNIT; + goto out; + } + + if (NumCaches > dxg_topology->g_props[NodeId].node.NumCaches) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + for (i = 0; i < wsl::Min(dxg_topology->g_props[NodeId].node.NumCaches, NumCaches); i++) { + assert(dxg_topology->g_props[NodeId].cache.size()); + CacheProperties[i] = dxg_topology->g_props[NodeId].cache[i]; + } + + err = HSAKMT_STATUS_SUCCESS; + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +HSAKMT_STATUS topology_get_iolink_props(HSAuint32 NodeId, HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) { + if (!dxg_topology->g_system || dxg_topology->g_props.empty() || NodeId >= dxg_topology->g_system->NumNodes) + return HSAKMT_STATUS_ERROR; + + memcpy(IoLinkProperties, dxg_topology->g_props[NodeId].link.data(), + NumIoLinks * sizeof(*IoLinkProperties)); + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS HSAKMTAPI +hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks, + HsaIoLinkProperties *IoLinkProperties) { + HSAKMT_STATUS err; + + if (!IoLinkProperties) + return HSAKMT_STATUS_INVALID_PARAMETER; + + CHECK_DXG_OPEN(); + + pthread_mutex_lock(&dxg_runtime->hsakmt_mutex); + + /* KFD ADD page 18, snapshot protocol violation */ + if (!dxg_topology->g_system || NodeId >= dxg_topology->g_system->NumNodes) { + err = 
HSAKMT_STATUS_INVALID_NODE_UNIT; + goto out; + } + + if (NumIoLinks > dxg_topology->g_props[NodeId].node.NumIOLinks) { + err = HSAKMT_STATUS_INVALID_PARAMETER; + goto out; + } + + assert(dxg_topology->g_props[NodeId].link.size()); + err = topology_get_iolink_props(NodeId, NumIoLinks, IoLinkProperties); + +out: + pthread_mutex_unlock(&dxg_runtime->hsakmt_mutex); + return err; +} + +uint16_t get_device_id_by_node_id(HSAuint32 node_id) { + if (dxg_topology->g_props.empty() || !dxg_topology->g_system || dxg_topology->g_system->NumNodes <= node_id) + return 0; + + return dxg_topology->g_props[node_id].node.DeviceId; +} + +bool prefer_ats(HSAuint32 node_id) { + return dxg_topology->g_props[node_id].node.Capability.ui32.HSAMMUPresent && + dxg_topology->g_props[node_id].node.NumCPUCores && + dxg_topology->g_props[node_id].node.NumFComputeCores; +} + +uint16_t get_device_id_by_gpu_id(HSAuint32 gpu_id) { + unsigned int i; + + if (dxg_topology->g_props.empty() || !dxg_topology->g_system) + return 0; + + for (i = 0; i < dxg_topology->g_system->NumNodes; i++) { + if (dxg_topology->g_props[i].node.KFDGpuID == gpu_id) + return dxg_topology->g_props[i].node.DeviceId; + } + + return 0; +} + +uint32_t get_direct_link_cpu(uint32_t gpu_node) { + HSAuint64 size = 0; + int32_t cpu_id; + HSAuint32 i; + + cpu_id = gpu_get_direct_link_cpu(gpu_node, dxg_topology->g_props); + if (cpu_id == -1) + return INVALID_NODEID; + + assert(dxg_topology->g_props[cpu_id].mem.size()); + + for (i = 0; i < dxg_topology->g_props[cpu_id].node.NumMemoryBanks; i++) + size += dxg_topology->g_props[cpu_id].mem[i].SizeInBytes; + + return size ? 
(uint32_t)cpu_id : INVALID_NODEID; +} + +HSAKMT_STATUS validate_nodeid_array(uint32_t **gpu_id_array, + uint32_t NumberOfNodes, + uint32_t *NodeArray) { + HSAKMT_STATUS ret; + unsigned int i; + + if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array) + return HSAKMT_STATUS_INVALID_PARAMETER; + + /* Translate Node IDs to gpu_ids */ + *gpu_id_array = (uint32_t *)malloc(NumberOfNodes * sizeof(uint32_t)); + if (!(*gpu_id_array)) + return HSAKMT_STATUS_NO_MEMORY; + for (i = 0; i < NumberOfNodes; i++) { + ret = validate_nodeid(NodeArray[i], *gpu_id_array + i); + if (ret != HSAKMT_STATUS_SUCCESS) { + free(*gpu_id_array); + break; + } + } + + return ret; +} + +uint32_t get_num_sysfs_nodes(void) { return dxg_topology->num_sysfs_nodes; } + +wsl::thunk::WDDMDevice *get_wddmdev(uint32_t node_id) { + if ((!dxg_topology->wdevices_.size()) || (!node_id) || (node_id >= dxg_topology->num_sysfs_nodes)) + return nullptr; + + return dxg_topology->wdevices_[node_id - 1]; +} + +uint32_t get_num_wddmdev() { + return dxg_topology->wdevices_.size(); +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h new file mode 100644 index 0000000000..4b7f8b0362 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/atomic_helpers.h @@ -0,0 +1,519 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +/* + Helpers to use native types with C++11 atomic operations. + Fixes GCC builtin functionality for x86 with respect to WC and non-temporal + stores. +*/ +#ifndef HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ +#define HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ + +#include +#include "utils.h" + +//ALWAYS_CONSERVATIVE will very likely overfence your code. 
+//For use as a debugging aid only. +#define ALWAYS_CONSERVATIVE 0 + +#if !ALWAYS_CONSERVATIVE +#if defined(__x86_64__) || defined(_M_X64) +#define X64_ORDER_WC 1 +#endif +#if X64_ORDER_WC +#include +#endif +#endif + +namespace wsl { +namespace atomic { + +static constexpr int c11ToBuiltInFlags(std::memory_order order) +{ +#if ALWAYS_CONSERVATIVE + return __ATOMIC_RELAXED; +#elif X64_ORDER_WC + return __ATOMIC_RELAXED; +#else + return (order == std::memory_order_relaxed) ? __ATOMIC_RELAXED : + (order == std::memory_order_acquire) ? __ATOMIC_ACQUIRE : + (order == std::memory_order_release) ? __ATOMIC_RELEASE : + (order == std::memory_order_seq_cst) ? __ATOMIC_SEQ_CST : + (order == std::memory_order_consume) ? __ATOMIC_CONSUME : + (order == std::memory_order_acq_rel) ? __ATOMIC_ACQ_REL : + __ATOMIC_SEQ_CST; +#endif +} + +static __forceinline void PreFence(std::memory_order order) { +#if ALWAYS_CONSERVATIVE + switch (order) { + case std::memory_order_release: + case std::memory_order_seq_cst: + case std::memory_order_acq_rel: + __atomic_thread_fence(__ATOMIC_SEQ_CST); + default:; + } +#elif X64_ORDER_WC + switch (order) { + case std::memory_order_release: + case std::memory_order_seq_cst: + case std::memory_order_acq_rel: + _mm_sfence(); + default:; + } +#endif +} + +static __forceinline void PostFence(std::memory_order order) { +#if ALWAYS_CONSERVATIVE + switch (order) { + case std::memory_order_seq_cst: + case std::memory_order_acq_rel: + case std::memory_order_acquire: + __atomic_thread_fence(__ATOMIC_SEQ_CST); + default:; + } +#elif X64_ORDER_WC + switch (order) { + case std::memory_order_seq_cst: + return _mm_mfence(); + case std::memory_order_acq_rel: + case std::memory_order_acquire: + return _mm_lfence(); + default:; + } +#endif +} + +static __forceinline void Fence(std::memory_order order=std::memory_order_seq_cst) { +#if ALWAYS_CONSERVATIVE + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif X64_ORDER_WC + switch (order) { + case std::memory_order_seq_cst: + 
case std::memory_order_acq_rel: + return _mm_mfence(); + case std::memory_order_acquire: + return _mm_lfence(); + case std::memory_order_release: + return _mm_sfence(); + default:; + } +#else + std::atomic_thread_fence(order); +#endif +} + +template +static __forceinline void BasicCheck(const T* ptr) { + constexpr bool value = __atomic_always_lock_free(sizeof(T), 0); + static_assert(value, "Atomic type may not be compatible with peripheral atomics."); +}; + +template +static __forceinline void BasicCheck(const volatile T* ptr) { + constexpr bool value = __atomic_always_lock_free(sizeof(T), 0); + static_assert(value, "Atomic type may not be compatible with peripheral atomics."); +}; + +/// @brief: Load value of type T atomically with specified memory order. +/// @param: ptr(Input), a pointer to type T. +/// @param: order(Input), memory order with atomic load, relaxed by default. +/// @return: T, loaded value. +template +static __forceinline T + Load(const T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_load(ptr, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile type T. +/// @param: order(Input), memory order with atomic load, relaxed by default. +/// @return: T, loaded value. +template +static __forceinline T + Load(const volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_load(ptr, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Store value of type T with specified memory order. +/// @param: ptr(Input), a pointer to instance which will be stored. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order with atomic store, relaxed by default. +/// @return: void. 
+template +static __forceinline void Store( + T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_store(ptr, &val, c11ToBuiltInFlags(order)); + PostFence(order); +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile instance which will be stored. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order with atomic store, relaxed by default. +/// @return: void. +template +static __forceinline void Store( + volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_store(ptr, &val, c11ToBuiltInFlags(order)); + PostFence(order); +} + +/// @brief: Compare and swap value atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored if condition is satisfied. +/// @param: expected(Input), value which is expected. +/// @param: order(Input), memory order with atomic operation. +/// @return: T, observed value of type T. +template +static __forceinline T + Cas(T* ptr, T val, T expected, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED); + PostFence(order); + return expected; +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be stored if condition is satisfied. +/// @param: expected(Input), value which is expected. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, observed value of type T. 
+template +static __forceinline T + Cas(volatile T* ptr, T val, T expected, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + __atomic_compare_exchange(ptr, &expected, &val, false, c11ToBuiltInFlags(order), __ATOMIC_RELAXED); + PostFence(order); + return expected; +} + +/// @brief: Exchange the value atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value prior to the exchange. +template +static __forceinline T + Exchange(T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Function overloading, for more info, see previous one. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be stored. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value prior to the exchange. +template +static __forceinline T + Exchange(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + T ret; + PreFence(order); + __atomic_exchange(ptr, &val, &ret, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Add value to variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be added. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value of the variable prior to the addition. 
+template +static __forceinline T + Add(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Subtract value from the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value to be subtraced. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of the variable prior to the subtraction. +template +static __forceinline T + Sub(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit And operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is ANDed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + And(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Or operation on variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is ORed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Or(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Xor operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: val(Input), value which is XORed with variable. +/// @order: order(Input), memory order which is relaxed by default. +/// @return: T, valud of variable prior to the opertaion. +template +static __forceinline T + Xor(T* ptr, T val, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Increase the value of variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Increment(T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Decrease the value of the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Decrement(T* ptr, std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Add value to variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be added. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, the value of the variable prior to the addition. +template +static __forceinline T + Add(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Subtract value from the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value to be subtraced. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of the variable prior to the subtraction. +template +static __forceinline T + Sub(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit And operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is ANDed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + And(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_and(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Or operation on variable atomically with specified memory order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is ORed with variable. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T Or(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_or(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Bit Xor operation on variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: val(Input), value which is XORed with variable. +/// @order: order(Input), memory order which is relaxed by default. +/// @return: T, valud of variable prior to the opertaion. +template +static __forceinline T + Xor(volatile T* ptr, T val, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_xor(ptr, val, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Increase the value of variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. 
+template +static __forceinline T + Increment(volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_add(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} + +/// @brief: Decrease the value of the variable atomically with specified memory +/// order. +/// @param: ptr(Input), a pointer to volatile variable which is operated on. +/// @param: order(Input), memory order which is relaxed by default. +/// @return: T, value of variable prior to the operation. +template +static __forceinline T + Decrement(volatile T* ptr, + std::memory_order order = std::memory_order_relaxed) { + BasicCheck(ptr); + PreFence(order); + T ret = __atomic_fetch_sub(ptr, 1, c11ToBuiltInFlags(order)); + PostFence(order); + return ret; +} +} // namespace atomic +} // namespace wsl + +#ifdef X64_ORDER_WC +#undef X64_ORDER_WC +#endif + +#ifdef ALWAYS_CONSERVATIVE +#undef ALWAYS_CONSERVATIVE +#endif + +#endif // HSA_RUNTIME_CORE_UTIL_ATOMIC_HELPERS_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h new file mode 100644 index 0000000000..b5817af40d --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lazy_ptr.h @@ -0,0 +1,155 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ +#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ + +#include +#include +#include + +#include "core/util/locks.h" +#include "core/util/utils.h" + +namespace wsl { + +/* + * Wrapper for a std::unique_ptr that initializes its object at first use. 
+ */ +template class lazy_ptr { + public: + lazy_ptr() {} + + explicit lazy_ptr(std::function Constructor) { reset(Constructor); } + + lazy_ptr(lazy_ptr&& rhs) { + obj = std::move(rhs.obj); + func = std::move(rhs.func); + } + + lazy_ptr& operator=(lazy_ptr&& rhs) { + obj = std::move(rhs.obj); + func = std::move(rhs.func); + } + + lazy_ptr(lazy_ptr&) = delete; + lazy_ptr& operator=(lazy_ptr&) = delete; + + void reset(std::function Constructor = nullptr) { + obj.reset(); + func = Constructor; + } + + void reset(T* ptr) { + obj.reset(ptr); + func = nullptr; + } + + bool operator==(T* rhs) const { return obj.get() == rhs; } + bool operator!=(T* rhs) const { return obj.get() != rhs; } + + const std::unique_ptr& operator->() const { + make(true); + assert(obj != nullptr && "Null dereference through lazy_ptr."); + return obj; + } + + std::unique_ptr& operator*() { + make(true); + return obj; + } + + const std::unique_ptr& operator*() const { + make(true); + return obj; + } + + /* + * Ensures that the object is created or is being created. + * This is useful when early construction of the object is required. + */ + void touch() const { make(false); } + + // Tells if the lazy object has been constructed or not. + // Construction may fail silently (return nullptr). + bool created() const { + std::atomic_thread_fence(std::memory_order_acquire); + return func == nullptr; + } + + // Tells if the lazy object exists or not. + bool empty() const { + std::atomic_thread_fence(std::memory_order_acquire); + return obj == nullptr; + } + + private: + mutable std::unique_ptr obj; + mutable std::function func; + mutable KernelMutex lock; + + // Separated from make to improve inlining. 
+ void make_body(bool block) const { + if (block) { + lock.Acquire(); + } else if (!lock.Try()) { + return; + } + MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + if (func == nullptr) return; + T* ptr = func(); + obj.reset(ptr); + std::atomic_thread_fence(std::memory_order_release); + func = nullptr; + } + + __forceinline void make(bool block) const { + if (!created()) { + make_body(block); + } + } + +}; + +} // namespace wsl + +#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp new file mode 100644 index 0000000000..020ca10b28 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/lnx/os_linux.cpp @@ -0,0 +1,769 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __linux__ +#include "core/util/os.h" +#include "core/util/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core/inc/runtime.h" +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + +namespace wsl { +namespace os { + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +void* __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* ar = (ThreadArgs*)arg; + ThreadEntry CallMe = ar->entry_function; + void* Data = ar->entry_args; + delete ar; + CallMe(Data); + return nullptr; +} + +// Thread container allows multiple waits and separate close (destroy). 
+class os_thread { + public: + explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize) + : thread(0), lock(nullptr), state(RUNNING) { + int err; + std::unique_ptr args(new ThreadArgs); + lock = CreateMutex(); + if (lock == nullptr) return; + + args->entry_args = threadArgument; + args->entry_function = function; + + pthread_attr_t attrib; + err = pthread_attr_init(&attrib); + if (err != 0) { + pr_err("pthread_attr_init failed: %s\n", strerror(err)); + return; + } + + if (stackSize != 0) { + stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize); + stackSize = AlignUp(stackSize, 4096); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err)); + err = pthread_attr_destroy(&attrib); + if (err != 0) { + pr_err("pthread_attr_destroy failed: %s\n", strerror(err)); + return; + } + } + } + + int cores = 0; + cpu_set_t* cpuset = nullptr; + + if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { + cores = get_nprocs_conf(); + cpuset = CPU_ALLOC(cores); + if (cpuset == nullptr) { + pr_err("CPU_ALLOC failed: %s\n", strerror(errno)); + return; + } + CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset); + for (int i = 0; i < cores; i++) { + CPU_SET_S(i, CPU_ALLOC_SIZE(cores), cpuset); + } + err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); + CPU_FREE(cpuset); + if (err != 0) { + pr_err("pthread_setaffinity_np failed: %s\n", strerror(err)); + return; + } + } + + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + + // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN + // Attempt to grow the stack within reason. 
+ if ((err == EINVAL) && stackSize != 0) { + while (stackSize < 20 * 1024 * 1024) { + stackSize *= 2; + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + pr_err("pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + if (err != EINVAL) break; + pr_debug("pthread_create returned EINVAL, doubling stack size\n"); + } + } + + if (err == 0) + args.release(); + else + thread = 0; + + err = pthread_attr_destroy(&attrib); + if (err != 0) { + pr_err("pthread_attr_destroy failed: %s\n", strerror(err)); + } + } + + os_thread(os_thread&& rhs) { + thread = rhs.thread; + lock = rhs.lock; + state = int(rhs.state); + rhs.thread = 0; + rhs.lock = nullptr; + } + + os_thread(os_thread&) = delete; + + ~os_thread() { + if (lock != nullptr) DestroyMutex(lock); + if ((state == RUNNING) && (thread != 0)) { + int err = pthread_detach(thread); + if (err != 0) pr_err("pthread_detach failed: %s\n", strerror(err)); + } + } + + bool Valid() { return (lock != nullptr) && (thread != 0); } + + bool Wait() { + if (state == FINISHED) return true; + AcquireMutex(lock); + if (state == FINISHED) { + ReleaseMutex(lock); + return true; + } + int err = pthread_join(thread, NULL); + bool success = (err == 0); + if (success) state = FINISHED; + ReleaseMutex(lock); + return success; + } + + private: + pthread_t thread; + Mutex lock; + std::atomic state; + enum { FINISHED = 0, RUNNING = 1 }; +}; + +static_assert(sizeof(LibHandle) == sizeof(void*), "OS abstraction size mismatch"); +static_assert(sizeof(Semaphore) == sizeof(sem_t*), "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(pthread_mutex_t*), "OS abstraction size mismatch"); +static_assert(sizeof(SharedMutex) == sizeof(pthread_rwlock_t*), "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(os_thread*), "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string filename) { + void* ret = 
dlopen(filename.c_str(), RTLD_LAZY); + if (ret == nullptr) pr_err("LoadLib(%s) failed: %s\n", filename.c_str(), dlerror()); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + void* ret = dlsym(*(void**)&lib, export_name.c_str()); + + // dlsym searches the given library and all the library's load dependencies. + // Remaining code limits symbol lookup to only the library handle given. + // This lookup pattern matches Windows. + if (ret == NULL) return ret; + + link_map* map; + int err = dlinfo(*(void**)&lib, RTLD_DI_LINKMAP, &map); + if (err == -1) { + pr_err("dlinfo failed: %s\n", dlerror()); + return nullptr; + } + + Dl_info info; + err = dladdr(ret, &info); + if (err == 0) { + pr_err("dladdr failed.\n"); + return nullptr; + } + + if (strcmp(info.dli_fname, map->l_name) == 0) return ret; + + return NULL; +} + +void CloseLib(LibHandle lib) { dlclose(*(void**)&lib); } + +/* + * @brief Look for a symbol called "HSA_AMD_TOOL_PRIORITY" across all loaded + * shared libraries, and if found, store the name of the library + * + * @param[in]: info A dl_phdr_info struct pointer, which contains information + * about library's load address, header, and name. + * + * @param[in]: size integer size of dl_phdr_info struct + * + * @param[out]: data copy of the data argument to dl_phdr_iterate call + * + * @retval:: Return 0 on Success. If callback returns a non-zero value, + * dl_iterate_phdr() will stop processing, even if there are unprocessed + * shared objects. + */ + +static int callback(struct dl_phdr_info* info, size_t size, void* data) { + std::vector* loadedToolsLib = (std::vector*)data; + assert(loadedToolsLib != nullptr); + /* + * Check if lib name is not empty and its not a "vdso.so" lib, + * The vDSO is a special shared object file that is built into the Linux kernel. + * It is not a regular shared library and thus does not have all the properties + * of regular shared libraries. 
The way the vDSO is loaded and organized in memory + * is different from regular shared libraries and it's not guaranteed that it + * will have a specific segment or section. Hence its skipped. + */ + + if ((info) && (info->dlpi_name[0] != '\0')) { + if (std::string(info->dlpi_name).find("vdso.so") != std::string::npos) return 0; + + /* + * Iterate through the program headers of the loaded lib and check for PT_DYNAMIC program + * header. If the PT_DYNAMIC program header is found, use dlpi_addr and dlpi_phdr members + * of dl_phdr_info struct to get the address of the dynamic section of the loaded + * library in memory + */ + + for (int i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) { + Elf64_Dyn* dyn_section = (Elf64_Dyn*)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr); + + char* strings = nullptr; + Elf64_Xword limit = 0; + + /* + * The dynamic section is searched for DT_STRTAB (address of string table), + * and DT_STRSZ (size of string table) + * DT_NULL - Marks the end of the _DYNAMIC array + */ + + for (int j = 0;; j++) { + if (dyn_section[j].d_tag == DT_NULL) break; + + if (dyn_section[j].d_tag == DT_STRTAB) strings = (char*)(dyn_section[j].d_un.d_ptr); + + if (dyn_section[j].d_tag == DT_STRSZ) limit = dyn_section[j].d_un.d_val; + } + + if (strings == nullptr) pr_debug("String table not found\n"); + + /* + * Hacky lookup, if string and symbol tables are found, + * iterate through the strings in string table and check if + * any string matches "HSA_AMD_TOOL_PRIORITY". 
// Suspends the calling thread for delay_in_millisec milliseconds.
// A zero or negative delay returns immediately. As with the previous
// usleep-based version, the sleep may end early if a signal is delivered.
void Sleep(int delay_in_millisec) {
  if (delay_in_millisec <= 0) return;

  // Split into seconds and nanoseconds so that no intermediate value can
  // overflow (delay * 1000 overflowed int for delays above ~35 minutes) and
  // the request stays valid for delays of one second or more.
  struct timespec request;
  request.tv_sec = delay_in_millisec / 1000;
  request.tv_nsec = static_cast<long>(delay_in_millisec % 1000) * 1000000L;
  nanosleep(&request, nullptr);
}
CreateThread(ThreadEntry function, void* threadArgument, uint stackSize) { + os_thread* result = new os_thread(function, threadArgument, stackSize); + if (!result->Valid()) { + delete result; + return nullptr; + } + + return reinterpret_cast(result); +} + +void CloseThread(Thread thread) { delete reinterpret_cast(thread); } + +bool WaitForThread(Thread thread) { return reinterpret_cast(thread)->Wait(); } + +bool WaitForAllThreads(Thread* threads, uint threadCount) { + for (uint i = 0; i < threadCount; i++) WaitForThread(threads[i]); + return true; +} + +bool IsEnvVarSet(std::string env_var_name) { + char* buff = NULL; + buff = getenv(env_var_name.c_str()); + return (buff != NULL); +} + +void SetEnvVar(std::string env_var_name, std::string env_var_value) { + setenv(env_var_name.c_str(), env_var_value.c_str(), 1); +} + +int GetProcessId() { + return ::getpid(); +} + +std::string GetEnvVar(std::string env_var_name) { + char* buff; + buff = getenv(env_var_name.c_str()); + std::string ret; + if (buff) { + ret = buff; + } + return ret; +} + +size_t GetUserModeVirtualMemorySize() { +#ifdef _LP64 + // https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt : + // user space is 0000000000000000 - 00007fffffffffff (=47 bits) + return (size_t)(0x800000000000); +#else + return (size_t)(0xffffffff); // ~4GB +#endif +} + +size_t GetUsablePhysicalHostMemorySize() { + struct sysinfo info = {0}; + if (sysinfo(&info) != 0) { + return 0; + } + + const size_t physical_size = + static_cast(info.totalram * info.mem_unit); + return std::min(GetUserModeVirtualMemorySize(), physical_size); +} + +uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; } + +// Os event implementation +typedef struct EventDescriptor_ { + pthread_cond_t event; + pthread_mutex_t mutex; + bool state; + bool auto_reset; +} EventDescriptor; + +EventHandle CreateOsEvent(bool auto_reset, bool init_state) { + EventDescriptor* eventDescrp; + eventDescrp = (EventDescriptor*)malloc(sizeof(EventDescriptor)); + 
+ pthread_mutex_init(&eventDescrp->mutex, NULL); + pthread_cond_init(&eventDescrp->event, NULL); + eventDescrp->auto_reset = auto_reset; + eventDescrp->state = init_state; + + EventHandle handle = reinterpret_cast(eventDescrp); + + return handle; +} + +int DestroyOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = pthread_cond_destroy(&eventDescrp->event); + ret_code |= pthread_mutex_destroy(&eventDescrp->mutex); + free(eventDescrp); + return ret_code; +} + +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + // Event wait time is 0 and state is non-signaled, return directly + if (milli_seconds == 0) { + int tmp_ret = pthread_mutex_trylock(&eventDescrp->mutex); + if (tmp_ret == EBUSY) { + // Timeout + return 1; + } + } + + int ret_code = 0; + pthread_mutex_lock(&eventDescrp->mutex); + if (!eventDescrp->state) { + if (milli_seconds == 0) { + ret_code = 1; + } else { + struct timespec ts; + struct timeval tp; + + ret_code = gettimeofday(&tp, NULL); + ts.tv_sec = tp.tv_sec; + ts.tv_nsec = tp.tv_usec * 1000; + + unsigned int sec = milli_seconds / 1000; + unsigned int mSec = milli_seconds % 1000; + + ts.tv_sec += sec; + ts.tv_nsec += mSec * 1000000; + + // More then one second, add 1 sec to the tv_sec elem + if (ts.tv_nsec > 1000000000) { + ts.tv_sec += 1; + ts.tv_nsec = ts.tv_nsec - 1000000000; + } + + ret_code = + pthread_cond_timedwait(&eventDescrp->event, &eventDescrp->mutex, &ts); + // Time out + if (ret_code == 110) { + ret_code = 0x14003; // 1 means time out in HSA + } + + if (ret_code == 0 && eventDescrp->auto_reset) { + eventDescrp->state = false; + } + } + } else if (eventDescrp->auto_reset) { + eventDescrp->state = false; + } + pthread_mutex_unlock(&eventDescrp->mutex); + + return ret_code; +} + +int SetOsEvent(EventHandle event) { + if (event == 
NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = 0; + ret_code = pthread_mutex_lock(&eventDescrp->mutex); + eventDescrp->state = true; + ret_code = pthread_mutex_unlock(&eventDescrp->mutex); + ret_code |= pthread_cond_signal(&eventDescrp->event); + + return ret_code; +} + +int ResetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + + EventDescriptor* eventDescrp = reinterpret_cast(event); + int ret_code = 0; + ret_code = pthread_mutex_lock(&eventDescrp->mutex); + eventDescrp->state = false; + ret_code = pthread_mutex_unlock(&eventDescrp->mutex); + + return ret_code; +} + +static double invPeriod = 0.0; + +uint64_t ReadAccurateClock() { + if (invPeriod == 0.0) AccurateClockFrequency(); + timespec time; + int err = clock_gettime(CLOCK_MONOTONIC_RAW, &time); + if (err != 0) { + pr_err("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed %s\n", strerror(errno)); + abort(); + } + return (uint64_t(time.tv_sec) * 1000000000ull + uint64_t(time.tv_nsec)) * invPeriod; +} + +uint64_t AccurateClockFrequency() { + static clockid_t clock = CLOCK_MONOTONIC; + static std::atomic first(true); + // Check kernel version - not a concurrency concern. + // use non-RAW for getres due to bug in older 2.6.x kernels + if (first.load(std::memory_order_acquire)) { + utsname kernelInfo; + if (uname(&kernelInfo) == 0) { + try { + std::string ver = kernelInfo.release; + size_t idx; + int major = std::stoi(ver, &idx); + int minor = std::stoi(ver.substr(idx + 1)); + if ((major >= 4) && (minor >= 4)) { + clock = CLOCK_MONOTONIC_RAW; + } + } catch (...) { + // Kernel version string doesn't conform to the standard pattern. + // Keep using the "safe" (non-RAW) clock. 
+ } + } + first.store(false, std::memory_order_release); + } + timespec time; + int err = clock_getres(clock, &time); + if (err != 0) { + pr_err("clock_getres failed %s\n", strerror(errno)); + abort(); + } + if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) { + pr_err("clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency (<1Hz).\n"); + abort(); + } + if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec); + return 1000000000ull / uint64_t(time.tv_nsec); +} + +SharedMutex CreateSharedMutex() { + pthread_rwlockattr_t attrib; + int err = pthread_rwlockattr_init(&attrib); + if (err != 0) { + pr_err("rw lock attribute init failed: %s\n", strerror(err)); + return nullptr; + } + +#ifdef __GLIBC__ + err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + if (err != 0) { + pr_err("Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } +#else + err = pthread_rwlockattr_setkind(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + if (err != 0) { + pr_err("Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } +#endif + + pthread_rwlock_t* lock = new pthread_rwlock_t; + err = pthread_rwlock_init(lock, &attrib); + if (err != 0) { + pr_err("rw lock init failed: %s\n", strerror(err)); + return nullptr; + } + + pthread_rwlockattr_destroy(&attrib); + return lock; +} + +bool TryAcquireSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_trywrlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + +bool AcquireSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_wrlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + +void ReleaseSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); + if (err != 0) { + pr_err("SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } +} + +bool TrySharedAcquireSharedMutex(SharedMutex lock) { + int err = pthread_rwlock_tryrdlock(*(pthread_rwlock_t**)&lock); + return err == 0; +} + 
// Resolution of CLOCK_BOOTTIME in nanoseconds; 0 means "not queried yet".
static uint64_t sys_clock_period_ = 0;

// Returns the cached CLOCK_BOOTTIME resolution in nanoseconds, querying it on
// first use. Never returns 0: if the resolution cannot be read or is reported
// as zero, 1 ns is assumed so callers can divide by the result safely.
static uint64_t SystemClockPeriod() {
  if (sys_clock_period_ == 0) {
    struct timespec res = {};
    uint64_t period = 0;
    if (clock_getres(CLOCK_BOOTTIME, &res) == 0) {
      period = uint64_t(res.tv_sec) * 1000000000ull + uint64_t(res.tv_nsec);
    }
    sys_clock_period_ = (period != 0) ? period : 1;
  }
  return sys_clock_period_;
}

// Returns the current CLOCK_BOOTTIME reading expressed in clock periods
// (nanoseconds divided by the clock resolution). Safe to call before
// SystemClockFrequency(); previously that divided by an uninitialised (zero) period.
uint64_t ReadSystemClock() {
  struct timespec ts = {};
  clock_gettime(CLOCK_BOOTTIME, &ts);
  uint64_t time = (uint64_t(ts.tv_sec) * 1000000000 + uint64_t(ts.tv_nsec));
  const uint64_t period = SystemClockPeriod();
  if (period != 1)
    return time / period;
  else
    return time;
}

// Returns the number of CLOCK_BOOTTIME periods per second.
uint64_t SystemClockFrequency() {
  return 1000000000 / SystemClockPeriod();
}
+ if (!__get_cpuid(0, &max_eax, (uint32_t*)&cpuinfo->ManufacturerID[0], + (uint32_t*)&cpuinfo->ManufacturerID[8], + (uint32_t*)&cpuinfo->ManufacturerID[4])) { + return false; + } + + if (!strcmp(cpuinfo->ManufacturerID, "AuthenticAMD")) { + if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) { + cpuinfo->mwaitx = !!((ecx >> 29) & 0x1); + } + } + return true; +#else + return false; +#endif +} + +} // namespace os +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h new file mode 100644 index 0000000000..a17fa09593 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/locks.h @@ -0,0 +1,290 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Library of syncronization primitives - to be added to as needed. + +#ifndef HSA_RUNTIME_CORE_UTIL_LOCKS_H_ +#define HSA_RUNTIME_CORE_UTIL_LOCKS_H_ + +#include "utils.h" +#include "os.h" + +namespace wsl { + +class HybridMutex { + public: + HybridMutex():lock_(0) { + sem_ = os::CreateSemaphore(); + } + + ~HybridMutex() { + os::DestroySemaphore(sem_); + } + + bool Try() { + int old = 0; + return lock_.compare_exchange_strong(old, 1); + } + + bool Acquire() { + int cnt = maxSpinIterPause + maxSpinIterYield; + + int old = 0; + while (!lock_.compare_exchange_strong(old, 1)) { + cnt--; + if (cnt > maxSpinIterPause) { + _mm_pause(); + } else if (cnt-- > maxSpinIterYield) { + os::YieldThread(); + } else { + os::WaitSemaphore(sem_); + cnt = maxSpinIterPause + maxSpinIterYield; + } + old = 0; + } + return true; + } + + void Release() { + int old = 1; + if (lock_.compare_exchange_strong(old, 0)) + os::PostSemaphore(sem_); + } + + private: + std::atomic lock_; + os::Semaphore sem_; + const uint32_t maxSpinIterPause = 55; + const uint32_t maxSpinIterYield = 55; + + /// @brief: Disable copiable and assignable ability. 
+ DISALLOW_COPY_AND_ASSIGN(HybridMutex); +}; + + +/// @brief: a class represents a kernel mutex. +/// Uses the kernel's scheduler to keep the waiting thread from being scheduled +/// until the lock is released (Best for long waits, though anything using +/// a kernel object is a long wait). +class KernelMutex { + public: + KernelMutex() { lock_ = os::CreateMutex(); } + ~KernelMutex() { os::DestroyMutex(lock_); } + + bool Try() { return os::TryAcquireMutex(lock_); } + bool Acquire() { return os::AcquireMutex(lock_); } + void Release() { os::ReleaseMutex(lock_); } + + private: + os::Mutex lock_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(KernelMutex); +}; + +/// @brief: represents a spin lock. +/// For very short hold durations on the order of the thread scheduling +/// quanta or less. +class SpinMutex { + public: + SpinMutex() { lock_ = 0; } + + bool Try() { + int old = 0; + return lock_.compare_exchange_strong(old, 1); + } + bool Acquire() { + int old = 0; + while (!lock_.compare_exchange_strong(old, 1)) + { + old=0; + os::YieldThread(); + } + return true; + } + void Release() { lock_ = 0; } + + private: + std::atomic lock_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(SpinMutex); +}; + +class KernelEvent { + public: + KernelEvent() { evt_ = os::CreateOsEvent(true, true); } + ~KernelEvent() { os::DestroyOsEvent(evt_); } + + bool IsSet() { return os::WaitForOsEvent(evt_, 0)==0; } + bool WaitForSet() { return os::WaitForOsEvent(evt_, 0xFFFFFFFF)==0; } + void Set() { os::SetOsEvent(evt_); } + void Reset() { os::ResetOsEvent(evt_); } + + private: + os::EventHandle evt_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(KernelEvent); +}; + +/// @brief: represents a yielding shared mutex. +/// aka read/write mutex +class KernelSharedMutex { + public: + /// @brief: Interfaces ScopedAcquire to shared operations. 
+ class Shared { + public: + explicit Shared(KernelSharedMutex* lock) : lock_(lock) {} + bool Try() { return lock_->TryShared(); } + bool Acquire() { return lock_->AcquireShared(); } + void Release() { lock_->ReleaseShared(); } + + private: + KernelSharedMutex* lock_; + }; + + KernelSharedMutex() { lock_ = os::CreateSharedMutex(); } + ~KernelSharedMutex() { os::DestroySharedMutex(lock_); } + + // Exclusive mode operations + bool Try() { return os::TryAcquireSharedMutex(lock_); } + bool Acquire() { return os::AcquireSharedMutex(lock_); } + void Release() { os::ReleaseSharedMutex(lock_); } + + // Shared mode operations + bool TryShared() { return os::TrySharedAcquireSharedMutex(lock_); } + bool AcquireShared() { return os::SharedAcquireSharedMutex(lock_); } + void ReleaseShared() { os::SharedReleaseSharedMutex(lock_); } + + // Return shared operations interface + Shared shared() { return Shared(this); } + + private: + os::SharedMutex lock_; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(KernelSharedMutex); +}; + +/// @brief: Type trait to identify mutex types +template class isMutex { + public: + enum { value = false }; +}; +template <> class isMutex { + public: + enum { value = true }; +}; +template <> class isMutex { + public: + enum { value = true }; +}; +template <> class isMutex { + public: + enum { value = true }; +}; +template <> class isMutex { + public: + enum { value = true }; +}; + +/// @brief: A class behaves as a lock in a scope. When trying to enter into the +/// critical section, creat a object of this class. After the control path goes +/// out of the scope, it will release the lock automatically. +template class ScopedAcquire { + public: + /// @brief: When constructing, acquire the lock. + /// @param: lock(Input), pointer to an existing lock. 
+ explicit ScopedAcquire(LockType* lock) : lock_(lock), doRelease(true) { + static_assert(isMutex::value, "ScopedAcquire requires a mutex type."); + lock_.Acquire(); + } + explicit ScopedAcquire(LockType lock) : lock_(lock), doRelease(true) { + static_assert(!isMutex::value, "Mutex types are not copyable."); + lock_.Acquire(); + } + + /// @brief: when destructing, release the lock. + ~ScopedAcquire() { + if (doRelease) lock_.Release(); + } + + /// @brief: Release the lock early. Avoid using when possible. + void Release() { + lock_.Release(); + doRelease = false; + } + + private: + /// @brief: Adapts between pointers to mutex types and mutex pointer types. + template class container { + public: + container(T* lock) : lock_(lock) {} + __forceinline bool Acquire() { return lock_->Acquire(); } + __forceinline void Release() { return lock_->Release(); } + + private: + T* lock_; + }; + + /// @brief: Specialization for mutex pointer types. + template class container { + public: + container(T lock) : lock_(lock) {} + __forceinline bool Acquire() { return lock_.Acquire(); } + __forceinline void Release() { return lock_.Release(); } + + private: + T lock_; + }; + + container::value> lock_; + bool doRelease; + + /// @brief: Disable copiable and assignable ability. + DISALLOW_COPY_AND_ASSIGN(ScopedAcquire); +}; + +} // namespace wsl + +#endif // HSA_RUNTIME_CORE_SUTIL_LOCKS_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h new file mode 100644 index 0000000000..2f40cd1581 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/os.h @@ -0,0 +1,327 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Minimal operating system abstraction interfaces. 
+ +#ifndef HSA_RUNTIME_CORE_UTIL_OS_H_ +#define HSA_RUNTIME_CORE_UTIL_OS_H_ + +#include +#include +#include "utils.h" + +namespace wsl { +namespace os { +typedef void* LibHandle; +typedef void* Semaphore; +typedef void* Mutex; +typedef void* SharedMutex; +typedef void* Thread; +typedef void* EventHandle; + +enum class os_t { OS_WIN = 0, OS_LINUX, COUNT }; +static __forceinline std::underlying_type::type os_index(os_t val) { + return std::underlying_type::type(val); +} + +#ifdef _WIN32 +static const os_t current_os = os_t::OS_WIN; +#elif __linux__ +static const os_t current_os = os_t::OS_LINUX; +#else +static_assert(false, "Operating System not detected!"); +#endif + +/// @brief: Loads dynamic library based on file name. Return value will be NULL +/// if failed. +/// @param: filename(Input), file name of the library. +/// @return: LibHandle. +LibHandle LoadLib(std::string filename); + +/// @brief: Gets the address of exported symbol. Returns NULL if failed. +/// @param: lib(Input), handle of the library that exports the symbol. +/// @param: export_name(Input), the name of the exported symbol. +/// @return: void*. +void* GetExportAddress(LibHandle lib, std::string export_name); + +/// @brief: Unloads the dynamic library. +/// @param: lib(Input), library handle which will be unloaded. +void CloseLib(LibHandle lib); + +/// @brief: Lists loaded tool libraries that contain +/// symbol HSA_AMD_TOOL_PRIORITY +/// @return: List of library handles +std::vector GetLoadedToolsLib(); + +/// @brief: Returns the library's path name. +/// @param: lib(Input), library handle +/// @return: Path name of library +std::string GetLibraryName(LibHandle lib); + +/// @brief: Creates a Semaphore, will return NULL if failed. +/// @param: void. +/// @return: Semaphore. +Semaphore CreateSemaphore(); + +/// @brief: Waits for the semaphore. This is a blocking wait. +/// If the Semaphore is signalled, this function will return. +/// @param: sem(Input), handle to the semaphore. +/// @return: bool.
+bool WaitSemaphore(Semaphore sem); + +/// @brief: Post/Signal/Wake-up the semaphore +/// @param: sem(Input), handle to the semaphore. +/// @return: void. +void PostSemaphore(Semaphore sem); + +/// @brief: Destroys the semaphore. +/// @param: sem(Input), handle to the semaphore. +/// @return: void. +void DestroySemaphore(Semaphore sem); + +/// @brief: Creates a mutex, will return NULL if failed. +/// @param: void. +/// @return: Mutex. +Mutex CreateMutex(); + +/// @brief: Tries to acquire the mutex once, if successful, returns true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool TryAcquireMutex(Mutex lock); + +/// @brief: Acquires the mutex, if the mutex is locked, it will wait until it is +/// released. If the mutex is acquired successfully, it will return true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool AcquireMutex(Mutex lock); + +/// @brief: Releases the mutex. +/// @param: lock(Input), handle to the mutex. +/// @return: void. +void ReleaseMutex(Mutex lock); + +/// @brief: Destroys the mutex. +/// @param: lock(Input), handle to the mutex. +/// @return: void. +void DestroyMutex(Mutex lock); + +/// @brief: Creates a shared mutex, will return NULL if failed. +/// @param: void. +/// @return: SharedMutex. +SharedMutex CreateSharedMutex(); + +/// @brief: Tries to acquire the mutex in exclusive mode once, if successful, returns true. +/// @param: lock(Input), handle to the shared mutex. +/// @return: bool. +bool TryAcquireSharedMutex(SharedMutex lock); + +/// @brief: Acquires the mutex in exclusive mode, if the mutex is locked, it will wait until it is +/// released. If the mutex is acquired successfully, it will return true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool AcquireSharedMutex(SharedMutex lock); + +/// @brief: Releases the mutex from exclusive mode. +/// @param: lock(Input), handle to the mutex. +/// @return: void.
+void ReleaseSharedMutex(SharedMutex lock); + +/// @brief: Tries to acquire the mutex in shared mode once, if successful, returns true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool TrySharedAcquireSharedMutex(SharedMutex lock); + +/// @brief: Acquires the mutex in shared mode, if the mutex is held in exclusive mode, it will wait until it +/// is released. If the mutex is acquired successfully, it will return true. +/// @param: lock(Input), handle to the mutex. +/// @return: bool. +bool SharedAcquireSharedMutex(SharedMutex lock); + +/// @brief: Releases the mutex from shared mode. +/// @param: lock(Input), handle to the mutex. +/// @return: void. +void SharedReleaseSharedMutex(SharedMutex lock); + +/// @brief: Destroys the mutex. +/// @param: lock(Input), handle to the mutex. +/// @return: void. +void DestroySharedMutex(SharedMutex lock); + +/// @brief: Puts current thread to sleep. +/// @param: delayInMs(Input), time in milliseconds for sleeping. +/// @return: void. +void Sleep(int delayInMs); + +/// @brief: Puts current thread to sleep. +/// @param: delayInUs(Input), time for sleeping (microseconds, going by the parameter name). +/// @return: void. +void uSleep(int delayInUs); + +/// @brief: Yields current thread. +/// @param: void. +/// @return: void. +void YieldThread(); + +typedef void (*ThreadEntry)(void*); + +/// @brief: Creates a thread, will return NULL if failed. +/// @param: entry_function(Input), a pointer to the function which the thread +/// starts from. +/// @param: entry_argument(Input), a pointer to the argument of the thread +/// function. +/// @param: stack_size(Input), size of the thread's stack, 0 by default. +/// @return: Thread, a handle to thread created. +Thread CreateThread(ThreadEntry entry_function, void* entry_argument, + uint stack_size = 0); + +/// @brief: Destroys the thread. +/// @param: thread(Input), handle of the thread to be destroyed. +/// @return: void.
+void CloseThread(Thread thread); + +/// @brief: Waits for specific thread to finish, if successful, return true. +/// @param: thread(Input), handle of the thread to wait for. +/// @return: bool. +bool WaitForThread(Thread thread); + +/// @brief: Waits for multiple threads to finish, if successful, return true. +/// @param: threads(Input), a pointer to a list of thread handles. +/// @param: thread_count(Input), number of threads to be waited on. +/// @return: bool. +bool WaitForAllThreads(Thread* threads, uint thread_count); + +/// @brief: Determines if environment key is set. +/// @param: env_var_name(Input), name of the environment variable. +/// @return: bool, true for binding any value to environment key, +/// including an empty string. False otherwise +bool IsEnvVarSet(std::string env_var_name); + +/// @brief: Sets the environment variable. +/// @param: env_var_name(Input), name of the environment variable. +/// @param: env_var_value(Input), value of the environment variable. +/// @return: void. +void SetEnvVar(std::string env_var_name, std::string env_var_value); + +/// @brief: Gets the value of environment variable. +/// @param: env_var_name(Input), name of the environment variable. +/// @return: std::string, value of the environment variable, returned as string. +std::string GetEnvVar(std::string env_var_name); + +/// @brief: Gets the process ID. +/// @param: void +/// @return: int, process ID returned as int. +int GetProcessId(); + +/// @brief: Gets the max virtual memory size accessible to the application. +/// @param: void. +/// @return: size_t, size of the accessible memory to the application. +size_t GetUserModeVirtualMemorySize(); + +/// @brief: Gets the max physical host system memory size. +/// @param: void. +/// @return: size_t, size of the physical host system memory. +size_t GetUsablePhysicalHostMemorySize(); + +/// @brief: Gets the virtual memory base address. It is hardcoded to 0. +/// @param: void. +/// @return: uintptr_t, always 0.
+uintptr_t GetUserModeVirtualMemoryBase(); + +/// @brief os event api, create an event +/// @param: auto_reset whether an event can reset the status automatically +/// @param: init_state initial state of the event +/// @return: event handle +EventHandle CreateOsEvent(bool auto_reset, bool init_state); + +/// @brief os event api, destroy an event +/// @param: event Event handle +/// @return: whether destroy is correct +int DestroyOsEvent(EventHandle event); + +/// @brief os event api, wait on event +/// @param: event Event handle +/// @param: milli_seconds wait time +/// @return: Indicate success or timeout +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds); + +/// @brief os event api, set event state +/// @param: event Event handle +/// @return: Whether event set is correct +int SetOsEvent(EventHandle event); + +/// @brief os event api, reset event state +/// @param: event Event handle +/// @return: Whether event reset is correct +int ResetOsEvent(EventHandle event); + +/// @brief reads a clock which is deemed to be accurate for elapsed time +/// measurements, though not necessarily fast to query +/// @return clock counter value +uint64_t ReadAccurateClock(); + +/// @brief retrieves the frequency in Hz of the unit used in ReadAccurateClock. +/// It does not necessarily reflect the resolution of the clock, but is the +/// value needed to convert a difference in the clock's counter value to elapsed +/// seconds. This frequency does not change at runtime. +/// @return returns the frequency +uint64_t AccurateClockFrequency(); + +/// @brief read the system clock which serves as the HSA system clock +/// counter in KFD.
+uint64_t ReadSystemClock(); + +/// @brief read the system clock frequency +uint64_t SystemClockFrequency(); + +typedef struct cpuid_s { + char ManufacturerID[13]; // 12 char, NULL terminated + bool mwaitx; +} cpuid_t; + +/// @brief parse CPUID +/// @param: cpuinfo struct to be filled +bool ParseCpuID(cpuid_t* cpuinfo); + +} // namespace os +} // namespace wsl + +#endif // HSA_RUNTIME_CORE_UTIL_OS_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h new file mode 100644 index 0000000000..1fb992eb63 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/simple_heap.h @@ -0,0 +1,394 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple best fit memory allocator with eager compaction. Manages block sub-allocation. +// For use when memory efficiency is more important than allocation speed. +// O(log n) time. + +#ifndef HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ +#define HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ + +#include +#include +#include + + +namespace wsl { + +template class SimpleHeap { + private: + struct Fragment_T { + typedef std::multimap::iterator ptr_t; + ptr_t free_list_entry_; // entry in free_list_, or free_list_.end() when the fragment is not listed + struct { + size_t size : 62; + bool discard : 1; // set by discard(); free() never re-lists such a fragment + bool free : 1; + }; + + Fragment_T(ptr_t Iterator, size_t Len, bool Free) + : free_list_entry_(Iterator), size(Len), discard(false), free(Free) {} + Fragment_T() = default; + }; + + struct Block { + uintptr_t base_ptr_; + size_t length_; + + Block(uintptr_t base, size_t length) : base_ptr_(base), length_(length) {} + Block() = default; + }; + + Allocator block_allocator_; + + std::multimap free_list_; // free fragments: size -> fragment address + std::map> block_list_; // block base -> (fragment address -> fragment) + std::deque block_cache_; // fully free blocks kept for reuse + + // Size of blocks that are at least partially in use.
+ size_t in_use_size_; + // Total size of block cache + size_t cache_size_; + + __forceinline bool isFree(const Fragment_T& node) { return node.free; } + __forceinline void setUsed(Fragment_T& node) { + node.free = false; + node.free_list_entry_ = free_list_.end(); + } + __forceinline void setFree(Fragment_T& node, typename Fragment_T::ptr_t Iterator) { + node.free_list_entry_ = Iterator; + node.free = true; + } + __forceinline Fragment_T makeFragment(size_t Len) { // builds a used fragment + return Fragment_T(free_list_.end(), Len, false); + } + __forceinline Fragment_T makeFragment(typename Fragment_T::ptr_t Iterator, size_t Len) { // builds a free fragment + return Fragment_T(Iterator, Len, true); + } + __forceinline void removeFreeListEntry(Fragment_T& node) { + if (node.free_list_entry_ != free_list_.end()) { + free_list_.erase(node.free_list_entry_); + node.free_list_entry_ = free_list_.end(); + } + } + __forceinline void discard(Fragment_T& node) { // unlist from free_list_ and flag for discard + removeFreeListEntry(node); + node.discard = true; + } + + public: + explicit SimpleHeap(const Allocator& BlockAllocator = Allocator()) + : block_allocator_(BlockAllocator), in_use_size_(0), cache_size_(0) {} + ~SimpleHeap() { + trim(); + // Leak here may be due to the user. Check is for debugging only. + // assert(in_use_size_ == 0 && "Leak in SimpleHeap."); + } + + SimpleHeap(const SimpleHeap& rhs) = delete; + SimpleHeap(SimpleHeap&& rhs) = delete; + SimpleHeap& operator=(const SimpleHeap& rhs) = delete; + SimpleHeap& operator=(SimpleHeap&& rhs) = delete; + + void* alloc(size_t bytes) { + // Find best fit. + uintptr_t base; + size_t size; + // Reused free fragments are aligned to GPU_HUGE_PAGE_SIZE when bytes >= GPU_HUGE_PAGE_SIZE, else to DEFAULT_GPU_PAGE_SIZE + size_t align_bytes = bytes; + const int retry = bytes >= GPU_HUGE_PAGE_SIZE ? 1 : 0; // only huge-page-sized requests get a second lookup + size_t align = bytes >= GPU_HUGE_PAGE_SIZE ?
GPU_HUGE_PAGE_SIZE : DEFAULT_GPU_PAGE_SIZE; + + for (int i = 0; i <= retry; i++) { + auto free_fragment = free_list_.lower_bound(align_bytes); + if (free_fragment == free_list_.end()) break; + + uintptr_t addr = free_fragment->second; + size = free_fragment->first; + + assert(size >= bytes && "SimpleHeap: map lower_bound failure."); + + // Find the containing block and fragment + auto it = block_list_.upper_bound(addr); + it--; + auto& frag_map = it->second; + const auto& fragment = frag_map.find(addr); + + assert(fragment != frag_map.end() && "Inconsistency in SimpleHeap."); + assert(size == fragment->second.size && "Inconsistency in SimpleHeap."); + + size_t delta = addr & (align - 1); // offset of addr inside its alignment unit (mask assumes align is a power of two) + if (!delta) { + // address is already aligned + base = addr; + free_list_.erase(free_fragment); + // Sub-allocate from fragment. + fragment->second.size = bytes; + setUsed(fragment->second); + // Record remaining free space. + if (size > bytes) { + free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes)); + frag_map[base + bytes] = makeFragment(free_fragment, size - bytes); + } + } else { + // If this is the first request and the requested size is not enough for alignment, + // then request for a bigger hole and do trim.
+ if (i == 0 && size < bytes + align - delta) { + align_bytes += align; + continue; + } + + uintptr_t aligned_base = addr + align - delta; + base = aligned_base; + + // Erase the old free list entry + free_list_.erase(free_fragment); + + // fragment 1 - free (alignment padding below aligned_base) + free_fragment = free_list_.insert(std::make_pair(aligned_base - addr, addr)); + frag_map[addr] = makeFragment(free_fragment, aligned_base - addr); + + // fragment 2 - used + frag_map[base] = makeFragment(bytes); + + // fragment 3 - free (tail after the allocation) + if (size > aligned_base - addr + bytes) { + free_fragment = free_list_.insert(std::make_pair(size - (aligned_base - addr) - bytes, aligned_base + bytes)); + frag_map[aligned_base + bytes] = makeFragment(free_fragment, size - (aligned_base - addr) - bytes); + } + } + return reinterpret_cast(base); + } + + // No usable fragment, check block cache + if (bytes < default_block_size() && !block_cache_.empty()) { + const auto& block = block_cache_.back(); + base = block.base_ptr_; + size = block.length_; + block_cache_.pop_back(); + cache_size_ -= size; + } else { // Alloc new block - new block may be larger than default. + void* ptr = block_allocator_.alloc(bytes, size); + if (ptr == nullptr) { + fprintf(stderr, "Block allocation failed, Allocator is expected to throw.\n"); + return nullptr; + } + base = reinterpret_cast(ptr); + } + + in_use_size_ += size; + assert(size >= bytes && "Alloc exceeds block size."); + // Sub alloc and insert free region. + if (size > bytes) { + auto free_fragment = free_list_.insert(std::make_pair(size - bytes, base + bytes)); + block_list_[base][base + bytes] = makeFragment(free_fragment, size - bytes); + } + // Track used region + block_list_[base][base] = makeFragment(bytes); + + // Disallow multiple suballocation from large blocks. + // Prevents a small allocation from retaining a large block.
+ if (bytes > default_block_size()) { + bool err = discardBlock(reinterpret_cast(base)); + assert(err && "Large block discard failed."); + } + + return reinterpret_cast(base); + } + + /* Returns the base of the block containing ptr when ptr is the start of a live (non-free) + * allocation made by this SimpleHeap; returns nullptr otherwise. */ + void* block_base(void* ptr) { + if (ptr == nullptr) + return nullptr; + + uintptr_t base = reinterpret_cast(ptr); + + // Find fragment and validate. + auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) + return nullptr; + frag_map_it--; + auto& frag_map = frag_map_it->second; + auto fragment = frag_map.find(base); + if (fragment == frag_map.end() || isFree(fragment->second)) + return nullptr; + + return reinterpret_cast(frag_map_it->first); + } + + void reset() { // NOTE(review): clears bookkeeping only; nothing here calls block_allocator_.free() - confirm callers release the blocks + free_list_.clear(); + block_list_.clear(); + block_cache_.clear(); + in_use_size_ = 0; + cache_size_ = 0; + } + + bool free(void* ptr) { + if (ptr == nullptr) return true; + + uintptr_t base = reinterpret_cast(ptr); + + // Find fragment and validate. Returns false for pointers that are not the start of a live allocation. + auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) return false; + frag_map_it--; + auto& frag_map = frag_map_it->second; + auto fragment = frag_map.find(base); + if (fragment == frag_map.end() || isFree(fragment->second)) return false; + + bool discard = fragment->second.discard; + + // Merge lower: fold this fragment into a free lower neighbour + if (fragment != frag_map.begin()) { + auto lower = fragment; + lower--; + if (isFree(lower->second)) { + removeFreeListEntry(lower->second); + lower->second.size += fragment->second.size; + frag_map.erase(fragment); + fragment = lower; + } + } + + // Merge upper: absorb a free upper neighbour + { + auto upper = fragment; + upper++; + if ((upper != frag_map.end()) && isFree(upper->second)) { + removeFreeListEntry(upper->second); + fragment->second.size += upper->second.size; + frag_map.erase(upper); + } + } + + // Release whole free blocks.
+ if (frag_map.size() == 1) { + Block block(fragment->first, fragment->second.size); + block_list_.erase(frag_map_it); + + // Discard or add to the block cache (discardBlock() already removed discarded blocks from in_use_size_). + if (discard) { + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + } else { + block_cache_.push_back(block); + cache_size_ += block.length_; + in_use_size_ -= block.length_; + } + + balance(); + + // Don't publish free space since block was moved to the cache. + return true; + } + + // Don't report free memory if discarding the fragment. + if (discard) return true; + + // Report free fragment + const auto& freeEntry = + free_list_.insert(std::make_pair(size_t(fragment->second.size), fragment->first)); + setFree(fragment->second, freeEntry); + + return true; + } + + void balance() { + // Release the oldest cached blocks while more than one is cached and the cache exceeds twice the in-use size. + while ((block_cache_.size() > 1) && (cache_size_ > in_use_size_ * 2)) { + const auto& block = block_cache_.front(); + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + cache_size_ -= block.length_; + block_cache_.pop_front(); + } + } + + void trim() { // returns every cached block to the allocator + for (const auto& block : block_cache_) + block_allocator_.free(reinterpret_cast(block.base_ptr_), block.length_); + block_cache_.clear(); + cache_size_ = 0; + } + + size_t cache_size() const { return cache_size_; } + + size_t default_block_size() const { return block_allocator_.block_size(); } + + // Prevent reuse of the block containing ptr. No further fragments will be allocated from the + // block and the block will not be added to the block cache when it is free. + bool discardBlock(void* ptr) { + if (ptr == nullptr) return true; + + uintptr_t base = reinterpret_cast(ptr); + + // Find block and validate.
+ auto frag_map_it = block_list_.upper_bound(base); + if (frag_map_it == block_list_.begin()) return false; + frag_map_it--; + auto& frag_map = frag_map_it->second; + if ((base < frag_map.begin()->first) || + (frag_map.rbegin()->first + frag_map.rbegin()->second.size <= base)) + return false; + + // Is block already discarded? (the loop below flags every fragment together, so the first one is representative) + if (frag_map.begin()->second.discard) return true; + + // Mark all fragments for discard and compute block size. Removes freelist records for all + // fragments in the block. + size_t size = 0; + for (auto& frag : frag_map) { + discard(frag.second); + size += frag.second.size; + } + + // Remove discarded block from in-use tracking and rebalance the block cache. + in_use_size_ -= size; + balance(); + + return true; + } +}; + +} // namespace wsl + +#endif // HSA_RUNTME_CORE_UTIL_SIMPLE_HEAP_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp new file mode 100644 index 0000000000..bcaef5dd87 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.cpp @@ -0,0 +1,185 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "small_heap.h" + +namespace wsl { + +// Inserts node into freelist after place. +// Assumes node will not be an end of the list (list has guard nodes). 
+void SmallHeap::insertafter(SmallHeap::iterator_t place, SmallHeap::iterator_t node) { + assert(place->first < node->first && "Order violation"); + assert(isfree(place->second) && "Freelist operation error."); + iterator_t next = place->second.next; + node->second.next = next; + node->second.prior = place; + place->second.next = node; + next->second.prior = node; +} + +// Removes node from freelist. +// Assumes node will not be an end of the list (list has guard nodes). +void SmallHeap::remove(SmallHeap::iterator_t node) { + assert(isfree(node->second) && "Freelist operation error."); + node->second.prior->second.next = node->second.next; + node->second.next->second.prior = node->second.prior; + setused(node->second); +} + +// Returns high if merge failed or the merged node. +SmallHeap::memory_t::iterator SmallHeap::merge(SmallHeap::memory_t::iterator low, + SmallHeap::memory_t::iterator high) { + assert(isfree(low->second) && "Merge with allocated block"); + assert(isfree(high->second) && "Merge with allocated block"); + + if ((char*)low->first + low->second.len != (char*)high->first) return high; + + assert(!islastfree(high->second) && "Illegal merge."); + + low->second.len += high->second.len; + low->second.next = high->second.next; + high->second.next->second.prior = low; + + memory.erase(high); + return low; +} + +void SmallHeap::free(void* ptr) { + if (ptr == nullptr) return; + + auto iterator = memory.find(ptr); + + // Check for illegal free + if (iterator == memory.end()) { + assert(false && "Illegal free."); + return; + } + + // Return memory to total and link node into free list + total_free += iterator->second.len; + + // Could also traverse the free list which might be faster in some cases. 
+ auto before = iterator; + before--; + while (!isfree(before->second)) before--; + assert(before->second.next->first > iterator->first && "Inconsistency in small heap."); + insertafter(before, iterator); + + // Attempt compaction + iterator = merge(before, iterator); + merge(iterator, iterator->second.next); + + // Update lowHighBondary + high.erase(ptr); +} + +void* SmallHeap::alloc(size_t bytes) { + // Is enough memory available? + if ((bytes > total_free) || (bytes == 0)) return nullptr; + + iterator_t current; + + // Walk the free list and allocate at first fitting location + current = firstfree(); + while (!islastfree(current->second)) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + // Split node + if (bytes != current->second.len) { + void* remaining = (char*)current->first + bytes; + Node& node = memory[remaining]; + node.len = current->second.len - bytes; + current->second.len = bytes; + insertafter(current, memory.find(remaining)); + } + + remove(current); + return current->first; + } + current = current->second.next; + } + assert(current->second.len == 0 && "Freelist corruption."); + + // Can't service the request due to fragmentation + return nullptr; +} + +void* SmallHeap::alloc_high(size_t bytes) { + // Is enough memory available? 
+ if ((bytes > total_free) || (bytes == 0)) return nullptr; + + iterator_t current; + + // Walk the free list and allocate at first fitting location + current = lastfree(); + while (!isfirstfree(current->second)) { + if (bytes <= current->second.len) { + // Decrement from total + total_free -= bytes; + + void* alloc; + // Split node + if (bytes != current->second.len) { + alloc = (char*)current->first + current->second.len - bytes; + current->second.len -= bytes; + Node& node = memory[alloc]; + node.len = bytes; + setused(node); + } else { + alloc = current->first; + remove(current); + } + + high.insert(alloc); + return alloc; + } + current = current->second.prior; + } + assert(current->second.len == 0 && "Freelist corruption."); + + // Can't service the request due to fragmentation + return nullptr; +} + +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h new file mode 100644 index 0000000000..f6e060cb09 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/small_heap.h @@ -0,0 +1,131 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// A simple first fit memory allocator with eager compaction. For use with few +// items (where list iteration is faster than trees). +// Not thread safe! 
+ +#ifndef HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_ +#define HSA_RUNTME_CORE_UTIL_SMALL_HEAP_H_ + +#include +#include + +#include "utils.h" + +namespace wsl { + +class SmallHeap { + private: + struct Node; + typedef std::map memory_t; + typedef memory_t::iterator iterator_t; + + struct Node { + size_t len; + iterator_t next; + iterator_t prior; + }; + + SmallHeap(const SmallHeap& rhs) = delete; + SmallHeap& operator=(const SmallHeap& rhs) = delete; + + void* const pool; + const size_t length; + + size_t total_free; + memory_t memory; + std::set high; + + __forceinline bool isfree(const Node& node) const { return node.next != memory.begin(); } + __forceinline bool islastfree(const Node& node) const { return node.next == memory.end(); } + __forceinline bool isfirstfree(const Node& node) const { return node.prior == memory.end(); } + __forceinline void setlastfree(Node& node) { node.next = memory.end(); } + __forceinline void setfirstfree(Node& node) { node.prior = memory.end(); } + __forceinline void setused(Node& node) { node.next = memory.begin(); } + + __forceinline iterator_t firstfree() { return memory.begin()->second.next; } + __forceinline iterator_t lastfree() { return memory.rbegin()->second.prior; } + void insertafter(iterator_t place, iterator_t node); + void remove(iterator_t node); + iterator_t merge(iterator_t low, iterator_t high); + + public: + SmallHeap() : pool(nullptr), length(0), total_free(0) {} + SmallHeap(void* base, size_t length) + : pool(base), length(length), total_free(length) { + assert(pool != nullptr && "Invalid base address."); + assert(pool != (void*)0xFFFFFFFFFFFFFFFFull && "Invalid base address."); + assert((char*)pool + length != (char*)0xFFFFFFFFFFFFFFFFull && "Invalid pool bounds."); + + Node& start = memory[0]; + Node& node = memory[pool]; + Node& end = memory[(void*)0xFFFFFFFFFFFFFFFFull]; + + start.len = 0; + start.next = memory.find(pool); + setfirstfree(start); + + node.len = length; + node.prior = memory.begin(); + node.next = 
--memory.end(); + + end.len = 0; + end.prior = start.next; + setlastfree(end); + + high.insert((void*)0xFFFFFFFFFFFFFFFFull); + } + + void* alloc(size_t bytes); + void* alloc_high(size_t bytes); + void free(void* ptr); + + void* base() const { return pool; } + size_t size() const { return length; } + size_t remaining() const { return total_free; } + void* high_split() const { return *high.begin(); } +}; + +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp new file mode 100644 index 0000000000..c5a2b57c64 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.cpp @@ -0,0 +1,111 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/util/timer.h" + +namespace wsl { +namespace timer { + +accurate_clock::init::init() { + freq = os::AccurateClockFrequency(); + accurate_clock::period_ns = 1e9 / double(freq); +} + +// Calibrates the fast clock using the accurate clock. 
// Measures how many raw fast-clock ticks elapse over an interval timed with
// accurate_clock, then stores the resulting frequency (ticks per second) and
// tick period (picoseconds) in the fast_clock statics.
fast_clock::init::init() {
  typedef accurate_clock clock;
  // Busy-wait interval for one trial; doubled after every batch of trials.
  clock::duration delay(std::chrono::milliseconds(1));

  // calibrate clock
  fast_clock::raw_rep min = 0;
  clock::duration elapsed;

  do {
    // Reset so the first accepted trial of this batch always wins.
    elapsed = clock::duration::max();

    for (int t = 0; t < 10; t++) {
      fast_clock::raw_rep r1, r2;
      clock::time_point t0, t1, t2, t3;

      // Bracket the first raw reading between two accurate readings. The
      // signal fences keep the compiler from reordering the reads.
      t0 = clock::now();
      std::atomic_signal_fence(std::memory_order_acq_rel);
      r1 = fast_clock::raw_now();
      std::atomic_signal_fence(std::memory_order_acq_rel);
      t1 = clock::now();
      std::atomic_signal_fence(std::memory_order_acq_rel);

      // Spin until at least `delay` has passed on the accurate clock.
      do {
        t2 = clock::now();
      } while (t2 - t1 < delay);

      // Bracket the second raw reading the same way.
      std::atomic_signal_fence(std::memory_order_acq_rel);
      r2 = fast_clock::raw_now();
      std::atomic_signal_fence(std::memory_order_acq_rel);
      t3 = clock::now();

      // If elapsed time is shorter than last recorded time and both the start
      // and end times are confirmed correlated then record the clock readings.
      // This protects against inaccuracy due to thread switching
      if ((t3 - t1 < elapsed) && ((t1 - t0) * 10 < (t2 - t1)) &&
          ((t3 - t2) * 10 < (t2 - t1))) {
        elapsed = t3 - t1;
        min = r2 - r1;
      }
    }
    delay += delay;
    // Repeat with a longer delay until at least 1000 raw ticks were measured.
    // `min` is not reset between batches, so it stays 0 only while no trial
    // has ever passed the correlation test.
  } while (min < 1000);

  fast_clock::freq = double(min) / duration_in_seconds(elapsed);
  fast_clock::period_ps = 1e12 / fast_clock::freq;
  // printf("Timer setup took %f ms\n", duration_in_seconds(elapsed)*1000.0f);
  // printf("Fast clock frequency: %f MHz\n", double(fast_clock::freq)/1e6);
}

// Static storage for both clocks. The *_init objects are defined after the
// values they fill in, and the accurate clock is set up before the fast clock
// that calibrates against it.
double accurate_clock::period_ns;
accurate_clock::raw_frequency accurate_clock::freq;
accurate_clock::init accurate_clock::accurate_clock_init;

double fast_clock::period_ps;
fast_clock::raw_frequency fast_clock::freq;
fast_clock::init fast_clock::fast_clock_init;
}  // namespace timer
}  // namespace wsl
diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h new file mode 100644 index 0000000000..3012685113 --- /dev/null +++ 
b/projects/rocr-runtime/libhsakmt/src/dxg/util/timer.h @@ -0,0 +1,173 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_TIMER_H_ +#define HSA_RUNTIME_CORE_UTIL_TIMER_H_ + +#include "core/util/utils.h" +#include "core/util/os.h" +#include +#include +#include + +namespace wsl { +namespace timer { + +// Needed to patch around a mixed arithmetic bug in MSVC's duration_cast as of +// VS 2013. +template +struct wide_type { + typedef double type; +}; +template <> +struct wide_type { + typedef uintmax_t type; +}; +template <> +struct wide_type { + typedef intmax_t type; +}; + +template +static __forceinline To + duration_cast(const std::chrono::duration& d) { + typedef typename wide_type::value, + std::is_signed::value>::type wide; + typedef std::chrono::duration unit_convert_t; + + unit_convert_t temp = std::chrono::duration_cast(d); + return To(static_cast(temp.count())); +} +// End patch + +template +static __forceinline double duration_in_seconds( + std::chrono::duration delta) { + typedef std::chrono::duration> seconds; + return seconds(delta).count(); +} + +template +static __forceinline rep duration_from_seconds(double delta) { + typedef std::chrono::duration> seconds; + return std::chrono::duration_cast(seconds(delta)); +} + +// Provices a C++11 standard clock interface to the os::AccurateClock functions +class accurate_clock { + public: + typedef double rep; + typedef std::nano period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ns)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef uint64_t raw_frequency; + + static __forceinline raw_rep raw_now() { return os::ReadAccurateClock(); } + static __forceinline raw_frequency raw_freq() { return freq; } + + private: + static double period_ns; + 
static raw_frequency freq; + + class init { + public: + init(); + }; + static init accurate_clock_init; +}; + +// Provices a C++11 standard clock interface to the lowest latency approximate +// clock +class fast_clock { + public: + typedef double rep; + typedef std::pico period; + typedef std::chrono::duration duration; + typedef std::chrono::time_point time_point; + + static const bool is_steady = true; + + static __forceinline time_point now() { + return time_point(duration(raw_now() * period_ps)); + } + + // These two extra APIs and types let us use clocks without conversion to the + // arbitrary period unit + typedef uint64_t raw_rep; + typedef double raw_frequency; + +#if defined(__x86_64__) || defined(_M_X64) + static __forceinline raw_rep raw_now() { return __rdtsc(); } + static __forceinline raw_frequency raw_freq() { return freq; } +#else + static __forceinline raw_rep raw_now() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (raw_rep(ts.tv_sec) * 1000000000 + raw_rep(ts.tv_nsec)); + } + static __forceinline raw_frequency raw_freq() { return 1.e-9; } +#endif + + private: + static double period_ps; + static raw_frequency freq; + + class init { + public: + init(); + }; + static init fast_clock_init; +}; +} // namespace timer +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h new file mode 100644 index 0000000000..15d61a87e1 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/utils.h @@ -0,0 +1,389 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +// Generally useful utility functions + +#ifndef HSA_RUNTIME_CORE_UTIL_UTILS_H_ +#define HSA_RUNTIME_CORE_UTIL_UTILS_H_ + +#include "stdint.h" +#include "stddef.h" +#include "stdlib.h" +#include "stdarg.h" +#include "unistd.h" +#include +#include +#include +#include +#include +#include + +namespace wsl { +extern FILE* log_file; +extern uint8_t log_flags[8]; + +typedef unsigned int uint; +typedef uint64_t uint64; + +#if defined(__GNUC__) +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + +// 2MB huge page size +#define GPU_HUGE_PAGE_SIZE (2 << 20) + +// 4KB page size +#define DEFAULT_GPU_PAGE_SIZE (1 << 12) + +#define __forceinline __inline__ __attribute__((always_inline)) +#define __declspec(x) __attribute__((x)) +#undef __stdcall +#define __stdcall // __attribute__((__stdcall__)) +#define __ALIGNED__(x) __attribute__((aligned(x))) + +void log_printf(const char* file, int line, const char* format, ...); + +static __forceinline void* _aligned_malloc(size_t size, size_t alignment) { +#ifdef _ISOC11_SOURCE + return aligned_alloc(alignment, size); +#else + void *mem = NULL; + if (0 != posix_memalign(&mem, alignment, size)) return NULL; + return mem; +#endif +} +static __forceinline void _aligned_free(void* ptr) { return free(ptr); } +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) +#include "intrin.h" +#define __ALIGNED__(x) __declspec(align(x)) +#if (_MSC_VER < 1800) // < VS 2013 +static __forceinline unsigned long long int strtoull(const char* str, + char** endptr, int base) { + return static_cast(_strtoui64(str, endptr, base)); +} +#endif +#if (_MSC_VER < 1900) // < VS 2015 +#define thread_local __declspec(thread) +#endif +#else +#error "Compiler and/or processor not identified." 
+#endif + +#define STRING2(x) #x +#define STRING(x) STRING2(x) + +#define PASTE2(x, y) x##y +#define PASTE(x, y) PASTE2(x, y) + +#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) + +#define LogPrint(flag, format, ...) \ + do { \ + if (hsa_flag_isset64(log_flags, flag)) \ + wsl::log_printf(__FILENAME__, __LINE__, format, ##__VA_ARGS__); \ + } while (false); + +// A macro to disallow the copy and move constructor and operator= functions +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + TypeName(TypeName&&) = delete; \ + void operator=(const TypeName&) = delete; \ + void operator=(TypeName&&) = delete; + +template +class ScopeGuard { + public: + explicit __forceinline ScopeGuard(const lambda& release) + : release_(release), dismiss_(false) {} + + ScopeGuard(ScopeGuard& rhs) { *this = rhs; } + + __forceinline ~ScopeGuard() { + if (!dismiss_) release_(); + } + __forceinline ScopeGuard& operator=(ScopeGuard& rhs) { + dismiss_ = rhs.dismiss_; + release_ = rhs.release_; + rhs.dismiss_ = true; + return *this; + } + __forceinline void Dismiss() { dismiss_ = true; } + + private: + lambda release_; + bool dismiss_; +}; + +template +static __forceinline ScopeGuard MakeScopeGuard(lambda rel) { + return ScopeGuard(rel); +} + +#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \ + auto lname = __VA_ARGS__; \ + ScopeGuard sname(lname); +#define MAKE_SCOPE_GUARD(...) \ + MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), \ + PASTE(scopeGuard, __COUNTER__), __VA_ARGS__) +#define MAKE_NAMED_SCOPE_GUARD(name, ...) \ + MAKE_SCOPE_GUARD_HELPER(PASTE(scopeGuardLambda, __COUNTER__), name, \ + __VA_ARGS__) + +/// @brief: Finds out the min one of two inputs, input must support ">" +/// operator. +/// @param: a(Input), a reference to type T. +/// @param: b(Input), a reference to type T. +/// @return: T. +template +static __forceinline T Min(const T& a, const T& b) { + return (a > b) ? 
b : a; +} + +template +static __forceinline T Min(const T& a, const T& b, Arg... args) { + return Min(a, Min(b, args...)); +} + +/// @brief: Find out the max one of two inputs, input must support ">" operator. +/// @param: a(Input), a reference to type T. +/// @param: b(Input), a reference to type T. +/// @return: T. +template +static __forceinline T Max(const T& a, const T& b) { + return (b > a) ? b : a; +} + +template +static __forceinline T Max(const T& a, const T& b, Arg... args) { + return Max(a, Max(b, args...)); +} + +/// @brief: Free the memory space which is newed previously. +/// @param: ptr(Input), a pointer to memory space. Can't be NULL. +/// @return: void. +struct DeleteObject { + template + void operator()(const T* ptr) const { + delete ptr; + } +}; + +/// @brief: Checks if a value is power of two, if it is, return true. Be careful +/// when passing 0. +/// @param: val(Input), the data to be checked. +/// @return: bool. +template +static __forceinline bool IsPowerOfTwo(T val) { + return (val & (val - 1)) == 0; +} + +/// @brief: Calculates the floor value aligned based on parameter of alignment. +/// If value is at the boundary of alignment, it is unchanged. +/// @param: value(Input), value to be calculated. +/// @param: alignment(Input), alignment value. +/// @return: T. +template +static __forceinline T AlignDown(T value, size_t alignment) { + return (T)((value / alignment) * alignment); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: T*, pointer to type T. +template +static __forceinline T* AlignDown(T* value, size_t alignment) { + return (T*)AlignDown((intptr_t)value, alignment); +} + +/// @brief: Calculates the ceiling value aligned based on parameter of +/// alignment. +/// If value is at the boundary of alignment, it is unchanged. 
+/// @param: value(Input), value to be calculated. +/// @param: alignment(Input), alignment value. +/// @param: T. +template +static __forceinline T AlignUp(T value, size_t alignment) { + return AlignDown((T)(value + alignment - 1), alignment); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: T*, pointer to type T. +template +static __forceinline T* AlignUp(T* value, size_t alignment) { + return (T*)AlignDown((intptr_t)((uint8_t*)value + alignment - 1), alignment); +} + +/// @brief: Checks if the input value is at the boundary of alignment, if it is, +/// @return true. +/// @param: value(Input), value to be checked. +/// @param: alignment(Input), alignment value. +/// @return: bool. +template +static __forceinline bool IsMultipleOf(T value, size_t alignment) { + return (AlignUp(value, alignment) == value); +} + +/// @brief: Same as previous one, but first parameter becomes pointer, for more +/// info, see the previous desciption. +/// @param: value(Input), pointer to type T. +/// @param: alignment(Input), alignment value. +/// @return: bool. 
+template +static __forceinline bool IsMultipleOf(T* value, size_t alignment) { + return (AlignUp(value, alignment) == value); +} + +static __forceinline uint32_t NextPow2(uint32_t value) { + if (value == 0) return 1; + uint32_t v = value - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +static __forceinline uint64_t NextPow2(uint64_t value) { + if (value == 0) return 1; + uint64_t v = value - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + return v + 1; +} + +static __forceinline bool strIsEmpty(const char* str) noexcept { return str[0] == '\0'; } + +static __forceinline std::string& ltrim(std::string& s) { + auto it = std::find_if(s.begin(), s.end(), + [](char c) { return !std::isspace(c, std::locale::classic()); }); + s.erase(s.begin(), it); + return s; +} + +static __forceinline std::string& rtrim(std::string& s) { + auto it = std::find_if(s.rbegin(), s.rend(), + [](char c) { return !std::isspace(c, std::locale::classic()); }); + s.erase(it.base(), s.end()); + return s; +} + +static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); } + +} // namespace wsl + +template +static __forceinline uint32_t BitSelect(T p) { + static_assert(sizeof(T) <= sizeof(uintptr_t), "Type out of range."); + static_assert(highBit < sizeof(uintptr_t) * 8, "Bit index out of range."); + + uintptr_t ptr = p; + if (highBit != (sizeof(uintptr_t) * 8 - 1)) + return (uint32_t)((ptr & ((1ull << (highBit + 1)) - 1)) >> lowBit); + else + return (uint32_t)(ptr >> lowBit); +} + +inline uint32_t PtrLow16Shift8(const void* p) { + uintptr_t ptr = reinterpret_cast(p); + return (uint32_t)((ptr & 0xFFFFULL) >> 8); +} + +inline uint32_t PtrHigh64Shift16(const void* p) { + uintptr_t ptr = reinterpret_cast(p); + return (uint32_t)((ptr & 0xFFFFFFFFFFFF0000ULL) >> 16); +} + +inline uint32_t PtrLow40Shift8(const void* p) { + uintptr_t ptr = reinterpret_cast(p); + return 
(uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8); +} + +inline uint32_t PtrHigh64Shift40(const void* p) { + uintptr_t ptr = reinterpret_cast(p); + return (uint32_t)((ptr & 0xFFFFFF0000000000ULL) >> 40); +} + +static inline uint8_t Ptr48High8(const void* p) { + uintptr_t ptr = reinterpret_cast(p); + return (uint8_t)((ptr & 0xFF0000000000ULL) >> 40); +} + +static inline uint32_t Ptr48Low32(const void* p) { + uintptr_t ptr = reinterpret_cast(p); + assert((ptr & 0xFFFFFFFFFF00ULL) == ptr); + return (uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8); +} + +inline uint32_t PtrLow32(const void* p) { + return static_cast(reinterpret_cast(p)); +} + +inline uint32_t PtrHigh32(const void* p) { + uint32_t ptr = 0; +#ifdef HSA_LARGE_MODEL + ptr = static_cast(reinterpret_cast(p) >> 32); +#endif + return ptr; +} + +inline uint32_t HighPart(uint64_t value) { + return (value & 0xFFFFFFFF00000000) >> 32; +} + +inline uint32_t LowPart(uint64_t value) { + return (value & 0x00000000FFFFFFFF); +} + +#include "atomic_helpers.h" + +#endif // HSA_RUNTIME_CORE_UTIL_UTILS_H_ diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp new file mode 100644 index 0000000000..b7f2285623 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/util/win/os_win.cpp @@ -0,0 +1,327 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 // Are we compiling for windows? 
+#define NOMINMAX + +#include "core/util/os.h" + +#include +#include +#include +#include + +#include +#include +#include + +#undef Yield +#undef CreateMutex + +namespace wsl { +namespace os { + +static_assert(sizeof(LibHandle) == sizeof(HMODULE), + "OS abstraction size mismatch"); +static_assert(sizeof(LibHandle) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Semaphore) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Mutex) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(Thread) == sizeof(::HANDLE), + "OS abstraction size mismatch"); +static_assert(sizeof(EventHandle) == sizeof(::HANDLE), + "OS abstraction size mismatch"); + +LibHandle LoadLib(std::string filename) { + HMODULE ret = LoadLibrary(filename.c_str()); + return *(LibHandle*)&ret; +} + +void* GetExportAddress(LibHandle lib, std::string export_name) { + return GetProcAddress(*(HMODULE*)&lib, export_name.c_str()); +} + +void CloseLib(LibHandle lib) { FreeLibrary(*(::HMODULE*)&lib); } + +std::vector GetLoadedLibs() { + // Use EnumProcessModulesEx + static_assert(false, "Not implemented."); +} + +std::string GetLibraryName(LibHandle lib) { + static_assert(false, "Not implemented."); +} + +Semaphore CreateSemaphore() { + sem = static_cast(CreateSemaphore(NULL, 0, LONG_MAX, NULL)); + assert(sem != NULL && "CreateSemaphore failed"); + + return *(Semaphore*)&sem; +} + +bool WaitSemaphore(Semaphore sem) { + return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0; +} + +void PostSemaphore(Semaphore sem) { + ReleaseSemaphore(static_cast(*sem), 1, NULL); +} + +void DestroySemaphore(Semaphore sem) { + if (!CloseHandle(static_cast(*sem))) { + assert("CloseHandle() failed"); + } + *sem = NULL; +} + +Mutex CreateMutex() { return CreateEvent(NULL, false, true, NULL); } + +bool TryAcquireMutex(Mutex lock) { + return WaitForSingleObject(*(::HANDLE*)&lock, 0) == WAIT_OBJECT_0; +} + +bool AcquireMutex(Mutex lock) { + 
return WaitForSingleObject(*(::HANDLE*)&lock, INFINITE) == WAIT_OBJECT_0; +} + +void ReleaseMutex(Mutex lock) { SetEvent(*(::HANDLE*)&lock); } + +void DestroyMutex(Mutex lock) { CloseHandle(*(::HANDLE*)&lock); } + +void Sleep(int delay_in_millisecond) { ::Sleep(delay_in_millisecond); } + +void uSleep(int delayInUs) { ::Sleep(delayInUs / 1000); } + +void YieldThread() { ::Sleep(0); } + +struct ThreadArgs { + void* entry_args; + ThreadEntry entry_function; +}; + +unsigned __stdcall ThreadTrampoline(void* arg) { + ThreadArgs* thread_args = (ThreadArgs*)arg; + ThreadEntry entry = thread_args->entry_function; + void* data = thread_args->entry_args; + delete thread_args; + entry(data); + _endthreadex(0); + return 0; +} + +Thread CreateThread(ThreadEntry entry_function, void* entry_argument, + uint stack_size) { + ThreadArgs* thread_args = new ThreadArgs(); + thread_args->entry_args = entry_argument; + thread_args->entry_function = entry_function; + uintptr_t ret = + _beginthreadex(NULL, stack_size, ThreadTrampoline, thread_args, 0, NULL); + return *(Thread*)&ret; +} + +void CloseThread(Thread thread) { CloseHandle(*(::HANDLE*)&thread); } + +bool WaitForThread(Thread thread) { + return WaitForSingleObject(*(::HANDLE*)&thread, INFINITE) == WAIT_OBJECT_0; +} + +bool WaitForAllThreads(Thread* threads, uint thread_count) { + return WaitForMultipleObjects(thread_count, threads, TRUE, INFINITE) == + WAIT_OBJECT_0; +} + +void SetEnvVar(std::string env_var_name, std::string env_var_value) { + SetEnvironmentVariable(env_var_name.c_str(), env_var_value.c_str()); +} + +std::string GetEnvVar(std::string env_var_name) { + char* buff; + DWORD char_count = GetEnvironmentVariable(env_var_name.c_str(), NULL, 0); + if (char_count == 0) return ""; + buff = (char*)alloca(sizeof(char) * char_count); + GetEnvironmentVariable(env_var_name.c_str(), buff, char_count); + buff[char_count - 1] = '\0'; + std::string ret = buff; + return ret; +} + +size_t GetUserModeVirtualMemorySize() { + SYSTEM_INFO 
system_info = {0}; + GetSystemInfo(&system_info); + return ((size_t)system_info.lpMaximumApplicationAddress + 1); +} + +size_t GetUsablePhysicalHostMemorySize() { + MEMORYSTATUSEX memory_status = {0}; + memory_status.dwLength = sizeof(memory_status); + if (GlobalMemoryStatusEx(&memory_status) == 0) { + return 0; + } + + const size_t physical_size = static_cast(memory_status.ullTotalPhys); + return std::min(GetUserModeVirtualMemorySize(), physical_size); +} + +uintptr_t GetUserModeVirtualMemoryBase() { return (uintptr_t)0; } + +// Os event wrappers +EventHandle CreateOsEvent(bool auto_reset, bool init_state) { + EventHandle evt = reinterpret_cast( + CreateEvent(NULL, (BOOL)(!auto_reset), (BOOL)init_state, NULL)); + return evt; +} + +int DestroyOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return CloseHandle(reinterpret_cast<::HANDLE>(event)); +} + +int WaitForOsEvent(EventHandle event, unsigned int milli_seconds) { + if (event == NULL) { + return -1; + } + + int ret_code = + WaitForSingleObject(reinterpret_cast<::HANDLE>(event), milli_seconds); + if (ret_code == WAIT_TIMEOUT) { + ret_code = 0x14003; // 0x14003 indicates timeout + } + return ret_code; +} + +int SetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return SetEvent(reinterpret_cast<::HANDLE>(event)); +} + +int ResetOsEvent(EventHandle event) { + if (event == NULL) { + return -1; + } + return ResetEvent(reinterpret_cast<::HANDLE>(event)); +} + +uint64_t ReadAccurateClock() { + uint64_t ret; + QueryPerformanceCounter((LARGE_INTEGER*)&ret); + return ret; +} + +uint64_t AccurateClockFrequency() { + uint64_t ret; + QueryPerformanceFrequency((LARGE_INTEGER*)&ret); + return ret; +} + +SharedMutex CreateSharedMutex() { + assert(false && "Not implemented."); + abort(); + return nullptr; +} + +bool TryAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +bool AcquireSharedMutex(SharedMutex lock) { + assert(false 
&& "Not implemented."); + abort(); + return false; +} + +void ReleaseSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +bool TrySharedAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +bool SharedAcquireSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); + return false; +} + +void SharedReleaseSharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +void DestroySharedMutex(SharedMutex lock) { + assert(false && "Not implemented."); + abort(); +} + +uint64_t ReadSystemClock() { + assert(false && "Not implemented."); + abort(); + return 0; +} + +uint64_t SystemClockFrequency() { + assert(false && "Not implemented."); + abort(); + return 0; +} + +bool ParseCpuID(cpuid_t* cpuinfo) { + assert(false && "Not implemented."); + abort(); + return false; +} + +} // namespace os +} // namespace wsl + +#endif diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp new file mode 100644 index 0000000000..80dc67d44f --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/version.cpp @@ -0,0 +1,36 @@ +/* + * Copyright © 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including + * the next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +const char rocdxgbuildid[] __attribute__((used)) = "ROCDXG BUILD ID: " STRING(ROCDXG_VERSION); + +HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo) { + CHECK_DXG_OPEN(); + + VersionInfo->KernelInterfaceMajorVersion = 1; + VersionInfo->KernelInterfaceMinorVersion = 17; + + return HSAKMT_STATUS_SUCCESS; +} diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp new file mode 100644 index 0000000000..d650651e31 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/cmd_util.cpp @@ -0,0 +1,320 @@ +/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. */ + +#include "impl/wddm/cmd_util.h" + +namespace wsl { +namespace thunk { + +/* + * Builds a COPY_DATA packet that copies data. 
+ */ +size_t CmdUtil::BuildCopyData( + uint64_t *pDstAddr, + void *pBuffer, + uint32_t dstSel, + uint32_t dstCachePolicy, + uint32_t srcSel, + uint32_t srcCachePolicy, + uint32_t countSel, + uint32_t wrConfirm) { + PM4MEC_COPY_DATA copy_data = {0}; + + GenerateCmdHeader(©_data, IT_COPY_DATA); + copy_data.bitfields2.dst_sel = dstSel; + copy_data.bitfields2.src_sel = srcSel; + copy_data.bitfields2.dst_cache_policy = dstCachePolicy; + copy_data.bitfields2.src_cache_policy = srcCachePolicy; + copy_data.bitfields2.count_sel = countSel; + copy_data.bitfields2.wr_confirm = wrConfirm; + copy_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(pDstAddr) >> 3); + copy_data.dst_addr_hi = PtrHigh32(pDstAddr); + memcpy(pBuffer, ©_data, sizeof(copy_data)); + + return sizeof(copy_data); +} + +/* + * Builds a EVENT_WRITE packet. + * Applications can use Barrier command to ensure their + * command is executed only after all other commands have + * completed their execution. + */ +size_t CmdUtil::BuildBarrier( + void *pBuffer, + uint32_t eventIndex, + uint32_t eventType) { + BarrierTemplate barrier = {0}; + + GenerateCmdHeader(&barrier.event_write, IT_EVENT_WRITE); + barrier.event_write.bitfields2.event_index = eventIndex; + barrier.event_write.bitfields2.event_type = eventType; + memcpy(pBuffer, &barrier, sizeof(barrier)); + + return sizeof(barrier); +} + +/** + * Builds a WRITE_DATA packet. 
+ * Writes two DWORDs into the GPU memory address "write_addr" + */ + +size_t CmdUtil::BuildWriteData64Command( + void* pBuffer, + uint64_t* write_addr, + uint64_t write_value) { + WriteDataTemplate command = {0}; + GenerateCmdHeader(&command.write_data, IT_WRITE_DATA); + + // Encode the user specified address to write to + uint64_t addr = uintptr_t(write_addr); + assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned"); + + // Set the bit to confirm the write operation and cache policy + command.write_data.bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation; + command.write_data.bitfields2.cache_policy = cache_policy__mec_write_data__bypass; + + // Specify the command to increment address if writing more than one DWord + command.write_data.bitfields2.addr_incr = addr_incr__mec_write_data__increment_address; + // Specify the class to which the write destination belongs + command.write_data.bitfields2.dst_sel = dst_sel__mec_write_data__memory; + + command.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2); + command.write_data.dst_mem_addr_hi = PtrHigh32(write_addr); + + // Specify the value to write + command.write_data.write_data_value = write_value; + + memcpy(pBuffer, &command, sizeof(command)); + return sizeof(command); +} + +/* + * Builds a ACQUIRE_MEM packet. + * Users can submit this command to + * invalidate Gpu caches - L1 and or L2. + */ +size_t CmdUtil::BuildAcquireMem( + uint8_t major, + void *pBuffer) { + size_t ret; + if (major == 9) { + gfx9::AcquireMemTemplate acq = {0}; + GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM); + // Specify the size of memory to invalidate. Size is + // specified in terms of 256 byte chunks. A coher_size + // of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits) + // of memory. The field coher_size_hi specifies memory from + // bits 40-64 for a total of 256 TB. 
+ acq.acquire_mem.coher_size = 0xFFFFFFFF; + acq.acquire_mem.bitfields4.coher_size_hi = 0xFF; + // Specify the address of memory to invalidate. The + // address must be 256 byte aligned. + acq.acquire_mem.coher_base_lo = 0; + acq.acquire_mem.bitfields6.coher_base_hi = 0; + // Specify the poll interval for determing if operation is complete + acq.acquire_mem.bitfields7.poll_interval = 4; + acq.acquire_mem.bitfields2.coher_cntl = + (1 << 29) | // CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK + (1 << 27) | // CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK + (1 << 28); // CP_COHER_CNTL__SH_KCACHE_VOL_ACTION_ENA_MASK + memcpy(pBuffer, &acq, sizeof(acq)); + ret = sizeof(acq); + } else if (major >= 10) { + gfx10::AcquireMemTemplate acq = {0}; + GenerateCmdHeader(&acq.acquire_mem, IT_ACQUIRE_MEM); + acq.acquire_mem.coher_size = 0xFFFFFFFF; + acq.acquire_mem.bitfields4.coher_size_hi = 0xFF; + acq.acquire_mem.coher_base_lo = 0; + acq.acquire_mem.bitfields6.coher_base_hi = 0; + acq.acquire_mem.bitfields7.poll_interval = 4; + acq.acquire_mem.bitfields8.gcr_cntl = + (1 << 16) | // SEQ = FORWARD + (1 << 15) | // GL2_WB + (1 << 14) | // GL2_INV + (1 << 9) | // GL1_INV + (1 << 8) | // GLV_INV + (1 << 7) | // GLK_INV + (1 << 6) | // GLK_WB + (1 << 5) | // GLM_INV + (1 << 4) | // GLM_WB + (1 << 0); // GLI_INV = ALL + memcpy(pBuffer, &acq, sizeof(acq)); + ret = sizeof(acq); + } + + return ret; +} + +/* + * Builds a scratch packet. 
+ */ +size_t CmdUtil::BuildScratch( + void *pScratchBase, + void *pBuffer) { + struct SetScratchTemplate scratch = {0}; + + GenerateSetShRegHeader(&scratch, mmCOMPUTE_DISPATCH_SCRATCH_BASE_LO); + scratch.scratch_lo = Ptr48Low32(pScratchBase); + scratch.scratch_hi = Ptr48High8(pScratchBase); + memcpy(pBuffer, &scratch, sizeof(scratch)); + + return sizeof(scratch); +} + +/** + * @ Set Compute Shader parameter for gfx11 and above + */ +size_t CmdUtil::BuildComputeShaderParams(void *pBuffer) { + struct DispatchProgramResourceRegs compute_shader_params = {0}; + + GenerateSetShRegHeader(&compute_shader_params, mmCOMPUTE_PGM_RSRC3); + // IMAGE_OP: Indicates the compute program contains an image op + // instruction and should be stalled by its WAIT_SYNC fence. + compute_shader_params.compute_pgm_rsrc3 = (1 << 31); + + memcpy(pBuffer, &compute_shader_params, sizeof(compute_shader_params)); + + return sizeof(compute_shader_params); +} + + +/* + * Builds a dispatch packet. + */ +size_t CmdUtil::BuildDispatch( + struct DispatchInfo *pInfo, + void *pBuffer) { + DispatchTemplate dispatch = {0}; + + GenerateSetShRegHeader(&dispatch.dimension_regs, mmCOMPUTE_NUM_THREAD_X); + dispatch.dimension_regs.compute_num_thread_x = pInfo->pPacket->workgroup_size_x; + dispatch.dimension_regs.compute_num_thread_y = pInfo->pPacket->workgroup_size_y; + dispatch.dimension_regs.compute_num_thread_z = pInfo->pPacket->workgroup_size_z; + + // TODO: Add AQL packet index for debugger + // Debugger requires AQL packet index in COMPUTE_DISPATCH_PKT_ADDR_LO + GenerateSetShRegHeader(&dispatch.program_regs, mmCOMPUTE_PGM_LO); + dispatch.program_regs.compute_pgm_lo = Ptr48Low32(pInfo->pEntry); + dispatch.program_regs.compute_pgm_hi = Ptr48High8(pInfo->pEntry); + + GenerateSetShRegHeader(&dispatch.program_resource_regs, mmCOMPUTE_PGM_RSRC1); + dispatch.program_resource_regs.compute_pgm_rsrc1 = pInfo->pKernelObject->compute_pgm_rsrc1; + if (pInfo->major == 11) { + 
AMD_HSA_BITS_SET(dispatch.program_resource_regs.compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 1); + } + dispatch.program_resource_regs.compute_pgm_rsrc2 = + (pInfo->ldsBlks << 15) | pInfo->pKernelObject->compute_pgm_rsrc2; + + GenerateSetShRegHeader(&dispatch.resource_regs, mmCOMPUTE_RESOURCE_LIMITS); + dispatch.resource_regs.compute_resource_limits = 0x3ff; + dispatch.resource_regs.compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + dispatch.resource_regs.compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + dispatch.resource_regs.compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + dispatch.resource_regs.compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + dispatch.resource_regs.compute_tmpring_size = pInfo->pAmdQueue->compute_tmpring_size; + + GenerateSetShRegHeader(&dispatch.compute_user_data_regs, mmCOMPUTE_USER_DATA_0); + + uint32_t sgpr_no = 0; + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { + assert(pInfo->major < 11); + pInfo->scratchBaseOffset[pInfo->offsetCnt++] = + offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) + + sgpr_no * sizeof(uint32_t); + + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->pAmdQueue->scratch_resource_descriptor[0]; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->pAmdQueue->scratch_resource_descriptor[1]; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->pAmdQueue->scratch_resource_descriptor[2]; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->srd; + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pPacket); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pPacket); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + 
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrLow32(pInfo->pAmdQueue); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = PtrHigh32(pInfo->pAmdQueue); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrLow32(pInfo->pPacket->kernarg_address); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrHigh32(pInfo->pPacket->kernarg_address); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) { + // This feature may be enabled as a side effect of indirect calls. + // However, the compiler team confirmed that the dispatch id itself is not used, + // so safe to send 0 for each dispatch. + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0; + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = 0; + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) { + assert(pInfo->major < 11); + pInfo->scratchBaseOffset[pInfo->offsetCnt++] = + offsetof(struct DispatchTemplate, compute_user_data_regs.compute_user_data[0]) + + sgpr_no * sizeof(uint32_t); + + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrLow32(pInfo->pScratchBase); + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + PtrHigh32(pInfo->pScratchBase); + } + if (AMD_HSA_BITS_GET(pInfo->pKernelObject->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) { + dispatch.compute_user_data_regs.compute_user_data[sgpr_no++] = + pInfo->scratchSizePerWave / (pInfo->wave32 ? 
32 : 64); + } + + GenerateCmdHeader(&dispatch.dispatch_direct, IT_DISPATCH_DIRECT); + dispatch.dispatch_direct.dispatch_initiator = + (1 << 0) | // COMPUTE_SHADER_EN + (1 << 2) | // FORCE_START_AT_000 + (1 << 5); // USE_THREAD_DIMENSIONS + if (pInfo->wave32) dispatch.dispatch_direct.dispatch_initiator |= (1 << 15); // CS_W32_EN + dispatch.dispatch_direct.dim_x = pInfo->pPacket->grid_size_x; + dispatch.dispatch_direct.dim_y = pInfo->pPacket->grid_size_y; + dispatch.dispatch_direct.dim_z = pInfo->pPacket->grid_size_z; + memcpy(pBuffer, &dispatch, sizeof(dispatch)); + + return sizeof(dispatch); +} + +/* + * Builds a ATOMIC_MEM packet. + * Users can submit this command + * to perform atomic operations. + */ +size_t CmdUtil::BuildAtomicMem( + uint64_t *pAddr, + uint32_t atomic, + void *pBuffer, + uint32_t cachePolicy, + uint64_t srcData) { + AtomicTemplate atom = {0}; + + GenerateCmdHeader(&atom.atomic, IT_ATOMIC_MEM); + atom.atomic.addr_lo = PtrLow32(pAddr); + atom.atomic.addr_hi = PtrHigh32(pAddr); + atom.atomic.bitfields2.atomic = atomic; + atom.atomic.bitfields2.cache_policy = cachePolicy; + atom.atomic.src_data_lo = LowPart(srcData); + atom.atomic.src_data_hi = HighPart(srcData); + memcpy(pBuffer, &atom, sizeof(atom)); + + return sizeof(atom); +} + +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp new file mode 100644 index 0000000000..f51af85404 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/device.cpp @@ -0,0 +1,780 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "impl/wddm/status.h" +#include "impl/wddm/types.h" +#include "impl/wddm/device.h" +#include "impl/wddm/queue.h" + +namespace wsl { +namespace thunk { + +const uint32_t WDDMDevice::cmdbuf_aql_frame_num_ = 0x1000; + +WDDMDevice::WDDMDevice(D3DKMT_HANDLE adapter, LUID adapter_luid, uint32_t node_id) + : adapter_(adapter), adapter_luid_(adapter_luid), node_id_(node_id) { + memset(&device_info_, 0, sizeof(device_info_)); + + ParseDeviceInfo(); + CreateDevice(); + SetPowerOptimization(false); + CreatePagingQueue(); + InitCmdbufInfo(); + QuerySegmentInfo(); +} + +WDDMDevice::~WDDMDevice() { + DestroyPagingQueue(); + SetPowerOptimization(true); + DestroyDevice(); + + DestroyDeviceInfo(); +} + +static NTSTATUS WDDMQueryAdapter(D3DKMT_HANDLE adapter, KMTQUERYADAPTERINFOTYPE type, + void *data, int size) +{ + D3DKMT_QUERYADAPTERINFO args = {0}; + + args.hAdapter = adapter; + args.Type = type; + args.pPrivateDriverData = data; + args.PrivateDriverDataSize = size; + + return DXCORE_CALL(D3DKMTQueryAdapterInfo(&args)); +} + +bool WDDMDevice::QuerySegmentInfo() +{ + uint32_t segmentCount = 0; + segment_infos_.clear(); + + // Get the number of segments + D3DKMT_QUERYSTATISTICS adapterQuery = {}; + adapterQuery.Type = D3DKMT_QUERYSTATISTICS_ADAPTER; + adapterQuery.AdapterLuid = adapter_luid_; + + NTSTATUS ret = DXCORE_CALL(D3DKMTQueryStatistics(&adapterQuery)); + if (ret == STATUS_SUCCESS) { + segmentCount = adapterQuery.QueryResult.AdapterInformation.NbSegments; + pr_debug("Total Segments: %u\n", segmentCount); + } else { + pr_err("Failed to query adapter info\n"); + return false; + } + + for (uint32_t i = 0; i < segmentCount; i++) { + + D3DKMT_QUERYSTATISTICS segQuery = {}; + segQuery.Type = D3DKMT_QUERYSTATISTICS_SEGMENT; + segQuery.AdapterLuid = adapter_luid_; + segQuery.QuerySegment.SegmentId = i; + 
+ ret = DXCORE_CALL(D3DKMTQueryStatistics(&segQuery)); + if (ret != STATUS_SUCCESS) { + pr_err("Failed to query segment %u info\n", i); + return false; + } + + auto& seg = segQuery.QueryResult.SegmentInformation; + + SegmentInfo info; + info.segment_id = i; + info.segment_type = seg.SegmentProperties.SegmentType; + info.system_memory = seg.SegmentProperties.SystemMemory; + info.aperture = seg.Aperture; + info.commit_limit = seg.CommitLimit; + + segment_infos_.push_back(info); + } + + return true; +} + +bool WDDMDevice::GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE segment_type, + uint32_t &segment_id) +{ + for (const auto& seg_info : segment_infos_) { + if (seg_info.segment_type == segment_type) { + segment_id = seg_info.segment_id; + return true; + } + } + pr_err("Failed to get segment id for type %u\n", segment_type); + return false; +} + +/*Local heap(dedicated GPU memory) includes visiable heap and invisiable heap. + *Non local heap refers to shared GPU memory and it is sytem memory. 
+ */ +uint64_t WDDMDevice::VramAvail(void) { + D3DKMT_QUERYSTATISTICS stats; + NTSTATUS ret; + uint64_t usedVis = 0; + uint64_t usedInv = 0; + uint64_t usedNonLocal = 0; + uint32_t segmentId = 0; + + // wait fence complete + uint64_t value = page_fence_value_.load(); + if(!CpuWait(&page_syncobj_, &value, 1, false)) + return HSA_STATUS_ERROR; + + if (IsDgpu()) { + // local cpu-visible memory + if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_MEMORY, segmentId)) + return HSA_STATUS_ERROR; + + memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS)); + stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT; + stats.AdapterLuid = adapter_luid_; + stats.QuerySegment.SegmentId = segmentId; + ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats)); + if (ret == 0) + usedVis = stats.QueryResult.SegmentInformation.BytesResident; + + // local invisible memory + if (device_info_.local_invisible_heap_size) { + segmentId++; + memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS)); + stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT; + stats.AdapterLuid = adapter_luid_; + stats.QuerySegment.SegmentId = 1; + + ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats)); + if (ret == 0) + usedInv = stats.QueryResult.SegmentInformation.BytesResident; + } + + return LocalHeapSize() - usedVis - usedInv; + } else { + // APU - NonLocal memory + if(!GetSegmentId(D3DKMT_QUERYSTATISTICS_SEGMENT_TYPE_SYSMEM, segmentId)) + return HSA_STATUS_ERROR; + + memset(&stats, 0, sizeof(D3DKMT_QUERYSTATISTICS)); + stats.Type = D3DKMT_QUERYSTATISTICS_SEGMENT; + stats.AdapterLuid = adapter_luid_; + stats.QuerySegment.SegmentId = segmentId; + ret = DXCORE_CALL(D3DKMTQueryStatistics(&stats)); + if (ret == 0) + usedNonLocal = stats.QueryResult.SegmentInformation.BytesResident; + + return NonLocalHeapSize() - usedNonLocal; + } +} + +bool WDDMDevice::CreateDevice(void) { + D3DKMT_CREATEDEVICE args = {0}; + args.hAdapter = adapter_; + + NTSTATUS ret = DXCORE_CALL(D3DKMTCreateDevice(&args)); + if (ret == STATUS_SUCCESS) { + device_ = args.hDevice; + 
return true; + } + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::DestroyDevice(void) { + D3DKMT_DESTROYDEVICE args = {0}; + args.hDevice = device_; + + NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyDevice(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::CreatePagingQueue(void) { + D3DKMT_CREATEPAGINGQUEUE args = {0}; + args.hDevice = device_; + args.Priority = D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL; + + NTSTATUS ret = DXCORE_CALL(D3DKMTCreatePagingQueue(&args)); + if (ret == STATUS_SUCCESS) { + page_queue_ = args.hPagingQueue; + page_syncobj_ = args.hSyncObject; + page_fence_addr_ = (uint64_t *)args.FenceValueCPUVirtualAddress; + page_fence_value_ = 0; + return true; + } + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::DestroyPagingQueue(void) { + D3DDDI_DESTROYPAGINGQUEUE args = {0}; + args.hPagingQueue = page_queue_; + + NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyPagingQueue(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +void WDDMDevice::SetPowerOptimization(bool restore) { + void *priv_data; + int priv_size; + + priv_size = thunk_proxy::GetPowerOptPrivDataSize(); + priv_data = malloc(priv_size); + assert(priv_data); + memset(priv_data, 0, priv_size); + thunk_proxy::FillinPowerOptPrivData(priv_data, restore); + + D3DKMT_ESCAPE d3dkmt_escape; + memset(&d3dkmt_escape, 0, sizeof(d3dkmt_escape)); + + d3dkmt_escape.hAdapter = adapter_; + d3dkmt_escape.hDevice = device_; + d3dkmt_escape.hContext = 0; //KMD only use device to identify the process + d3dkmt_escape.Type = D3DKMT_ESCAPE_DRIVERPRIVATE; + d3dkmt_escape.pPrivateDriverData = priv_data; + d3dkmt_escape.PrivateDriverDataSize = priv_size; + d3dkmt_escape.Flags.HardwareAccess = true; + + NTSTATUS status = DXCORE_CALL(D3DKMTEscape(&d3dkmt_escape)); + pr_debug("status %d, restore %d\n", status, restore); + free(priv_data); +} + +void 
WDDMDevice::UpdatePageFence(uint64_t fence_value) { + uint64_t current = page_fence_value_.load(); + + // atomically set fence value when target is bigger than current one + do { + if (current >= fence_value) + break; + } while (!page_fence_value_.compare_exchange_weak(current, fence_value)); +} + +ErrorCode WDDMDevice::CreateGpuMemory(const GpuMemoryCreateInfo &create_info, + GpuMemory **gpu_mem, gpusize *gpu_va) { + ErrorCode ret; + + *gpu_mem = nullptr; + auto mem = new GpuMemory(this); + if (create_info.dmabuf_fd > 0) + ret = mem->ImportPhysicalHandle(create_info, gpu_va); + else + ret = mem->Init(create_info); + if (ret == ErrorCode::Success) + *gpu_mem = mem; + else + delete mem; + + return ret; +} + +void *WDDMDevice::Lock(D3DKMT_HANDLE handle) { + D3DKMT_LOCK2 args = {0}; + args.hDevice = device_; + args.hAllocation = handle; + + NTSTATUS ret = DXCORE_CALL(D3DKMTLock2(&args)); + if (ret == STATUS_SUCCESS) + return args.pData; + + pr_err("fail %x\n", ret); + return NULL; +} + +bool WDDMDevice::Unlock(D3DKMT_HANDLE handle) { + D3DKMT_UNLOCK2 args = {0}; + args.hDevice = device_; + args.hAllocation = handle; + + NTSTATUS ret = DXCORE_CALL(D3DKMTUnlock2(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::CreateContext(int engine, D3DKMT_HANDLE *handle) { + void *priv_data; + int priv_size; + + int ordinal = EngineOrdinal(engine, &device_info_); + if (ordinal < 0) + return false; + + priv_size = thunk_proxy::GetContextPrivDataSize(); + priv_data = malloc(priv_size); + assert(priv_data); + memset(priv_data, 0, priv_size); + thunk_proxy::FillinContextPrivData(priv_data, SupportStateShadowingByCpFw()); + + D3DKMT_CREATECONTEXTVIRTUAL args = {0}; + args.hDevice = device_; + args.EngineAffinity = 1 << 0; + args.NodeOrdinal = ordinal; + args.pPrivateDriverData = priv_data; + args.PrivateDriverDataSize = priv_size; + args.ClientHint = D3DKMT_CLIENTHINT_OPENCL; + + if (IsHwsEnabled(engine)) + 
args.Flags.HwQueueSupported = 1; + else + args.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(engine, &device_info_); + + NTSTATUS ret = DXCORE_CALL(D3DKMTCreateContextVirtual(&args)); + if (ret == STATUS_SUCCESS) { + *handle = args.hContext; + free(priv_data); + return true; + } + + free(priv_data); + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::DestroyContext(D3DKMT_HANDLE handle) { + D3DKMT_DESTROYCONTEXT args = {0}; + args.hContext = handle; + + NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyContext(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::GpuWait(WDDMQueue *queue, const D3DKMT_HANDLE *syncobjs, + uint64_t *values, int count) { + + D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMGPU args = {0}; + args.hContext = queue->context; + args.ObjectCount = count; + args.ObjectHandleArray = syncobjs; + args.MonitoredFenceValueArray = values; + + NTSTATUS ret = DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromGpu(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::GpuSignal(D3DKMT_HANDLE context, const D3DKMT_HANDLE *syncobjs, + uint64_t *value, int count) { + D3DKMT_SIGNALSYNCHRONIZATIONOBJECTFROMGPU args = {0}; + args.hContext = context; + args.ObjectCount = count; + args.ObjectHandleArray = syncobjs; + args.MonitoredFenceValueArray = value; + + NTSTATUS ret = DXCORE_CALL(D3DKMTSignalSynchronizationObjectFromGpu(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::CpuWait(const D3DKMT_HANDLE *syncobjs, uint64_t *value, + int count, bool wait_any) { + D3DKMT_WAITFORSYNCHRONIZATIONOBJECTFROMCPU args = {0}; + args.hDevice = device_; + args.ObjectCount = count; + args.ObjectHandleArray = syncobjs; + args.FenceValueArray = value; + args.Flags.WaitAny = wait_any; + + NTSTATUS ret = 
DXCORE_CALL(D3DKMTWaitForSynchronizationObjectFromCpu(&args)); + if (ret == STATUS_SUCCESS) + return true; + + pr_err("fail %x\n", ret); + return false; +} + +bool WDDMDevice::WaitOnPagingFenceFromCpu() { + uint64_t page_fence_value = 0; + + page_fence_value = page_fence_value_.load(); + if (CpuWait(&page_syncobj_, &page_fence_value, 1, false)) + return true; + + return false; +} + +bool WDDMDevice::CreateSyncobj(D3DKMT_HANDLE *handle, uint64_t **addr) { + D3DKMT_CREATESYNCHRONIZATIONOBJECT2 args = {0}; + args.hDevice = device_; + args.Info.Type = D3DDDI_MONITORED_FENCE; + args.Info.MonitoredFence.EngineAffinity = 1 << 0; + + NTSTATUS ret = DXCORE_CALL(D3DKMTCreateSynchronizationObject2(&args)); + if (ret == STATUS_SUCCESS) { + *handle = args.hSyncObject; + *addr = (uint64_t *)args.Info.MonitoredFence.FenceValueCPUVirtualAddress; + pr_debug("create syncobj cpu addr=%p gpu addr=%" PRIx64 "\n", + args.Info.MonitoredFence.FenceValueCPUVirtualAddress, + args.Info.MonitoredFence.FenceValueGPUVirtualAddress); + + return true; + } + + pr_err("fail %x\n", ret); + return false; +} + +void WDDMDevice::DestroySyncobj(D3DKMT_HANDLE handle) { + D3DKMT_DESTROYSYNCHRONIZATIONOBJECT args = {0}; + args.hSyncObject = handle; + + NTSTATUS ret = DXCORE_CALL(D3DKMTDestroySynchronizationObject(&args)); + if (ret != STATUS_SUCCESS) + pr_err("fail %x\n", ret); +} + +void WDDMDevice::InitCmdbufInfo(void) { + if (device_info_.major == 9) { + cmdbuf_aql_frame_size_ = 2 * sizeof(gfx9::AcquireMemTemplate); + } else if (device_info_.major >= 10) { + cmdbuf_aql_frame_size_ = 2 * sizeof(gfx10::AcquireMemTemplate); + } + + if (device_info_.major >= 11) { + cmdbuf_aql_frame_size_ += sizeof(SetScratchTemplate); + cmdbuf_aql_frame_size_ += sizeof(DispatchProgramResourceRegs); // BuildComputeShaderParams + } + + cmdbuf_aql_frame_size_ += + sizeof(PM4MEC_COPY_DATA) * 2 + + sizeof(BarrierTemplate) * 2 + + sizeof(DispatchTemplate) + + sizeof(AtomicTemplate) * 2; + + // Add safety margin to account for 
alignment and future additions + cmdbuf_aql_frame_size_ += 128; + + cmdbuf_aql_frame_size_ = AlignUp(cmdbuf_aql_frame_size_, 0x10); + + cmdbuf_size_ = AlignUp(cmdbuf_aql_frame_num_ * cmdbuf_aql_frame_size_, 0x1000); +} + +uint32_t WDDMDevice::LdsBlocks(const hsa_kernel_dispatch_packet_t *pkt) { + static const uint32_t blk_sz = 512; + uint32_t total_sz = pkt->group_segment_size; + uint32_t blk_num = (total_sz + blk_sz - 1) / blk_sz; + return blk_num; +} + +NTSTATUS WDDMCreateDevices(std::vector &devices) +{ + bool supported = false; + D3DKMT_ENUMADAPTERS2 args = {0}; + NTSTATUS ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args)); + if (ret != STATUS_SUCCESS) + return ret; + + if (!args.NumAdapters) { + return STATUS_SUCCESS; + } + + D3DKMT_ADAPTERINFO *info = new D3DKMT_ADAPTERINFO[args.NumAdapters]; + if (!info) + return STATUS_NO_MEMORY; + + args.pAdapters = info; + ret = DXCORE_CALL(D3DKMTEnumAdapters2(&args)); + if (ret != STATUS_SUCCESS) + goto err_out0; + + for (int i = 0; i < args.NumAdapters; i++) { + D3DKMT_QUERY_DEVICE_IDS query = {0}; + + ret = WDDMQueryAdapter(info[i].hAdapter, KMTQAITYPE_PHYSICALADAPTERDEVICEIDS, + &query, sizeof(query)); + if (ret != STATUS_SUCCESS) + goto err_out1; + + if (query.DeviceIds.VendorID != 0x1002) + continue; + + supported = thunk_proxy::QueryAdapterSupported(query.DeviceIds.DeviceID); + + if (supported) { + auto device = new WDDMDevice( + info[i].hAdapter, info[i].AdapterLuid, devices.size() + 1); + if (!device) + goto err_out1; + devices.push_back(device); + } + } + + delete[] info; + return STATUS_SUCCESS; + + err_out1: + for (auto &device : devices) + delete device; + err_out0: + delete[] info; + return ret; +} + +bool WDDMDevice::ParseDeviceInfo() { + bool ret; + + memset(&device_info_, 0, sizeof(device_info_)); + ret = thunk_proxy::ParseAdapterInfo(adapter_, &device_info_); + if (!ret) + return false; + + return true; +} + +void WDDMDevice::DestroyDeviceInfo() { + free(device_info_.adapter_info); +} + +void 
WDDMDevice::GetClockCounters(uint64_t *gpu, uint64_t *cpu) { + + uint32_t engine = GetComputeEngine(); + int ordinal = EngineOrdinal(engine, &device_info_); + + D3DKMT_QUERYCLOCKCALIBRATION args = {0}; + + /* LDA(Linked Display Adapter) + * In the LDA design multiple physical GPUs are linked together to be controlled + * as a single object from the point of view of power manager, GPU scheduler and + * GPU memory manager. The physical GPUs are represented by a signal logical adapter + * object. There is a single DXGADAPTER objects, a single KMD adapter object. + * + * Set PhysicalAdapterIndex to 0 by default with None LDA mode. + */ + args.hAdapter = adapter_; + args.NodeOrdinal = ordinal; + args.PhysicalAdapterIndex = 0; + + NTSTATUS status = DXCORE_CALL(D3DKMTQueryClockCalibration(&args)); + if (status) { + pr_debug("status %d \n", status); + } else { + if (gpu) + *gpu = args.ClockData.GpuClockCounter; + + if (cpu) + *cpu = args.ClockData.CpuClockCounter; + } +} + +bool WDDMDevice::CreateQueue(WDDMQueue *queue) { + if (!CreateContext(queue->queue_engine, &queue->context)) + return false; + + GpuMemory *gpu_mem = nullptr; + if (queue->cmdbuf_addr == 0) { + GpuMemoryCreateInfo create_info{}; + create_info.size = queue->cmdbuf_size; + create_info.domain = thunk_proxy::kSystem; + + auto code = CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) + goto err_out0; + + queue->cmdbuf = gpu_mem->GetGpuMemoryHandle(); + queue->cmdbuf_addr = gpu_mem->GpuAddress(); + } + + if (queue->Init()) + goto err_out1; + + return true; + +err_out1: + delete gpu_mem; +err_out0: + DestroyContext(queue->context); + + return false; +} + +void WDDMDevice::DestroyQueue(WDDMQueue *queue) { + + queue->Fini(); + + auto cmdbuf_mem = GpuMemory::Convert(queue->cmdbuf); + delete cmdbuf_mem; + + DestroyContext(queue->context); +} + +bool WDDMDevice::SubmitToSwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value) { + void *priv_data; + int 
priv_size; + + priv_size = thunk_proxy::GetSubmitPrivDataSize(); + priv_data = malloc(priv_size); + assert(priv_data); + memset(priv_data, 0, priv_size); + thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, false); + + D3DKMT_SUBMITCOMMAND args = {0}; + args.Commands = command_addr; + args.CommandLength = command_size; + args.BroadcastContextCount = 1; + args.BroadcastContext[0] = queue->context; + args.pPrivateDriverData = priv_data; + args.PrivateDriverDataSize = priv_size; + + NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommand(&args)); + if (ret != STATUS_SUCCESS) { + pr_err("fail %x\n", ret); + free(priv_data); + return false; + } + + free(priv_data); + + if (!GpuSignal(queue->context, &queue->syncobj, &fence_value, 1)) + return false; + + return true; +} + +bool WDDMDevice::CreateHwQueue(WDDMQueue *queue) { + void *priv_data; + int priv_size; + + priv_size = thunk_proxy::GetHwQueuePrivDataSize(); + priv_data = malloc(priv_size); + assert(priv_data); + memset(priv_data, 0, priv_size); + bool FwManagedGfxState = SupportStateShadowingByCpFw(); + thunk_proxy::FillinHwQueuePrivData(priv_data, FwManagedGfxState, queue->prio); + + D3DKMT_CREATEHWQUEUE createHwQueue = {0}; + createHwQueue.hHwContext = queue->context; + createHwQueue.Flags.DisableGpuTimeout = thunk_proxy::ShouldDisableGpuTimeout(queue->queue_engine, &device_info_); + createHwQueue.pPrivateDriverData = priv_data; + createHwQueue.PrivateDriverDataSize = priv_size; + + NTSTATUS ret = DXCORE_CALL(D3DKMTCreateHwQueue(&createHwQueue)); + if (ret != STATUS_SUCCESS) { + pr_err("fail %x\n", ret); + free(priv_data); + return false; + } + + free(priv_data); + + queue->queue = createHwQueue.hHwQueue; + queue->syncobj = createHwQueue.hHwQueueProgressFence; + queue->sync_addr = (uint64_t *)createHwQueue.HwQueueProgressFenceCPUVirtualAddress; + + return true; +} + +bool WDDMDevice::DestroyHwQueue(WDDMQueue *queue) { + D3DKMT_DESTROYHWQUEUE DestroyHwQueue = { + .hHwQueue = queue->queue, 
+ }; + + NTSTATUS ret = DXCORE_CALL(D3DKMTDestroyHwQueue(&DestroyHwQueue)); + if (ret != STATUS_SUCCESS) { + pr_err("fail %x\n", ret); + return false; + } + + return true; +} + +bool WDDMDevice::SubmitToHwQueue(WDDMQueue *queue, uint64_t command_addr, + uint64_t command_size, uint64_t fence_value) { + void *priv_data; + int priv_size; + + priv_size = thunk_proxy::GetSubmitPrivDataSize(); + priv_data = malloc(priv_size); + assert(priv_data); + memset(priv_data, 0, priv_size); + thunk_proxy::FillinSubmitPrivData(priv_data, queue->queue, command_addr, command_size, true); + + D3DKMT_SUBMITCOMMANDTOHWQUEUE args = {0}; + args.hHwQueue = queue->queue; + args.HwQueueProgressFenceId = fence_value; + args.CommandBuffer = command_addr; + args.CommandLength = command_size; + args.pPrivateDriverData = priv_data; + args.PrivateDriverDataSize = priv_size; + + NTSTATUS ret = DXCORE_CALL(D3DKMTSubmitCommandToHwQueue(&args)); + if (ret != STATUS_SUCCESS) { + pr_err("fail %x\n", ret); + free(priv_data); + return false; + } + + free(priv_data); + + return true; +} + +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp new file mode 100644 index 0000000000..e374be8867 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/gpu_memory.cpp @@ -0,0 +1,594 @@ +#include +#include +#include +#include "impl/wddm/gpu_memory.h" +#include "impl/wddm/device.h" +#include "util/utils.h" + +using namespace std; + +namespace wsl { +namespace thunk { + +size_t GpuMemory::CalcChunkNumbers(gpusize size) { + const auto chunk_size = WDDMDevice::GpuMemoryChunkSize; + return (size + chunk_size - 1) / chunk_size; +} + +gpusize GpuMemory::AdjustSize(gpusize size) const { + const auto &device_info = device_->DeviceInfo(); + + if (device_info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal) { + uint32_t alignment = device_info.big_page_alignment_size; + // 
BigPage is only supported for allocations > bigPageMinAlignment. + // Also, if bigPageMinAlignment == 0, BigPage optimization is not supported per KMD. + // We do either LargePage or BigPage alignment, whichever has a higher value. + if ((device_info.hw_big_page_min_alignment_size > 0) && (size > device_info.hw_big_page_min_alignment_size)) { + alignment = std::max(alignment, device_info.hw_big_page_min_alignment_size); + if (size > device_info.hw_big_page_alignment_size) + alignment = std::max(alignment, device_info.hw_big_page_alignment_size); + } + if (alignment > 0) + size = AlignUp(size, alignment); + } else { + const size_t min_size = 4096; + size = AlignUp(size, min_size); + } + return size; +} + +GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) { + num_allocations_ = 0; + alloc_handles_ptr_ = nullptr; + alloc_handle_ = 0; + resource_ = 0; + mem_fd_ = -1; +} + +GpuMemory::~GpuMemory() { + FreeGpuVirtualAddress(GpuAddress(), Size()); + FreePhysicalMemory(); + if (desc_.handle_ape_addr > 0) + dxg_runtime->HandleApertureFree(desc_.handle_ape_addr); +} + +ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) { + desc_.domain = create_info.domain; + desc_.adapter_luid = device_->GetLuid(); + desc_.client_size = create_info.size; + desc_.alignment = create_info.alignment; + desc_.mem_flags = create_info.mem_flags; + desc_.engine_flag = create_info.engine_flag; + desc_.flags.is_virtual = create_info.flags.virtual_alloc; + desc_.flags.is_physical_only = create_info.flags.physical_only; + desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous; + desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer; + desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter; + desc_.flags.is_va_required = create_info.flags.alloc_va; + desc_.flags.is_blit_kernel_object = create_info.flags.blit_kernel_object; + + /* we can't tell the allocation is regular vmm or ipc mem at creation stage, + they share 
same creation parameters, so forcing all vram allocations to + sharable to support IPC mem */ + if (create_info.flags.interprocess || + desc_.domain == thunk_proxy::AllocDomain::kLocal) + desc_.flags.is_shared = true; + + desc_.flags.is_locked = create_info.flags.locked; + desc_.size = AdjustSize(desc_.client_size); + + if (IsUserMemory() || IsSystem()) + desc_.cpu_addr = create_info.user_ptr; + + num_allocations_ = CalcChunkNumbers(Size()); + if (num_allocations_ == 1) + alloc_handles_ptr_ = &alloc_handle_; + else + alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_]; + + memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle)); + + auto code = ErrorCode::Success; + + if (IsPhysicalOnly()) { + code = CreatePhysicalMemory(); + if (code == ErrorCode::Success) + code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr); + return code; + } + + code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment); + if (IsVirtual() || (code != ErrorCode::Success)) + return code; + + bool physical_created = false; + + auto guard = MakeScopeGuard([this, &physical_created, &code]() { + if (code != ErrorCode::Success) { + + if (physical_created) { + FreePhysicalMemory(); + } + FreeGpuVirtualAddress(GpuAddress(), Size()); + } + }); + (void)guard; + + code = CreatePhysicalMemory(); + if (code != ErrorCode::Success) + return code; + + physical_created = true; + + code = MapGpuVirtualAddress(GpuAddress(), Size()); + if (code != ErrorCode::Success) + return code; + + code = MakeResident(); + if (code != ErrorCode::Success) + return code; + + if (!GetDevice()->WaitOnPagingFenceFromCpu()) + code = ErrorCode::Unknown; + + return code; +} + +ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) { + auto code = ErrorCode::Success; + size_t i = 0; + auto map_addr = addr; + auto map_size = size; + + while (offset >= WDDMDevice::GpuMemoryChunkSize) { + offset -= 
WDDMDevice::GpuMemoryChunkSize; + i += 1; + } + + while (map_size > 0) { + auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize); + + D3DDDI_MAPGPUVIRTUALADDRESS args{}; + + args.hPagingQueue = device_->PagingQueue(); + args.BaseAddress = map_addr; + args.hAllocation = GetAllocationHandle(i); + args.SizeInPages = block_size / 0x1000; + args.Protection.NoAccess = 1; + + code = d3dthunk::MapGpuVirtualAddress(&args); + + if (code == ErrorCode::NotReady) + device_->UpdatePageFence(args.PagingFenceValue); + else if (code != ErrorCode::Success) + break; + + map_addr += block_size; + map_size -= block_size; + offset = 0; // reset second unmapped allocation offset to zero + i += 1; + } + + return code; +} + +ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) { + + auto code = ErrorCode::Success; + size_t i = 0; + auto map_addr = addr; + auto map_size = size; + const size_t _4K = 0x1000; + + while (offset >= WDDMDevice::GpuMemoryChunkSize) { + offset -= WDDMDevice::GpuMemoryChunkSize; + i += 1; + } + const size_t first_chunk = i; + const auto first_chunk_offset = offset; + /* Found two limitation for local vram: + * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail + * 2. 
visible vram can not be cpu mapped when command submission or after gpu mapped + */ + while (map_size > 0) { + auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize); + + D3DDDI_MAPGPUVIRTUALADDRESS args{}; + + args.hPagingQueue = device_->PagingQueue(); + args.BaseAddress = map_addr; + args.hAllocation = GetAllocationHandle(i); + args.OffsetInPages = offset / _4K; + args.SizeInPages = block_size / _4K; + args.Protection.Write = 1; + + code = d3dthunk::MapGpuVirtualAddress(&args); + + if (code != ErrorCode::Success) { + if (code == ErrorCode::NotReady) { + const uint64_t fence_value = args.PagingFenceValue; + device_->UpdatePageFence(fence_value); + code = ErrorCode::Success; + } else + break; + } + + map_addr += block_size; + map_size -= block_size; + offset = 0; // reset second mapped allocation offset to zero + i++; + } + + if (code != ErrorCode::Success) { + // Map failed, unmap partial mapped block + offset = first_chunk_offset; + map_addr = addr; + map_size = size; + for (size_t j = first_chunk; j < i; j++) { + auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize); + + D3DDDI_MAPGPUVIRTUALADDRESS args{}; + + args.hPagingQueue = device_->PagingQueue(); + args.BaseAddress = map_addr; + args.hAllocation = 0; + args.OffsetInPages = offset / _4K; + args.SizeInPages = block_size / _4K; + args.Protection.NoAccess = 1; + + auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args); + if (unmap_code == ErrorCode::NotReady) + device_->UpdatePageFence(args.PagingFenceValue); + + map_addr += block_size; + map_size -= block_size; + } + } + + return code; +} + +ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) { + ErrorCode status; + gpusize gpu_virt_addr = 0; + if ((desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd) + && desc_.domain == thunk_proxy::AllocDomain::kSystem) { + int mfd = (mem_fd_ > -1)? 
mem_fd_ : -1; + status = dxg_runtime->ReserveIPCSysMem(Size(), &gpu_virt_addr, desc_.alignment, mfd, desc_.flags.is_locked); + if (status == ErrorCode::Success) + mem_fd_ = mfd; + } else { + status = dxg_runtime->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment, + desc_.flags.is_locked); + } + + if (status == ErrorCode::Success) { + desc_.gpu_addr = gpu_virt_addr; + + if (IsSystem()) + desc_.cpu_addr = reinterpret_cast(desc_.gpu_addr); + } + return status; +} + +ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) { + if (mem_fd_ > -1) + return dxg_runtime->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_); + + return base_addr != 0 ? + dxg_runtime->FreeGpuVirtualAddress(desc_.domain, base_addr, size) : + ErrorCode::Success; +} + +ErrorCode GpuMemory::CreatePhysicalMemory() { + + assert(!IsVirtual() && NumChunks() > 0); + + const auto num_allocations = NumChunks(); + void *priv_drv_data; + void *priv_alloc_data; + int priv_drv_data_size; + int priv_alloc_data_size; + + thunk_proxy::GetAllocPrivDataSize(&priv_drv_data_size, &priv_alloc_data_size); + int total_size = priv_drv_data_size + + num_allocations * priv_alloc_data_size + + num_allocations * sizeof(D3DDDI_ALLOCATIONINFO2); + priv_drv_data = malloc(total_size); + if (!priv_drv_data) + return ErrorCode::OutOfMemory; + + memset(priv_drv_data, 0, total_size); + thunk_proxy::FillinAllocPrivDrvData(priv_drv_data, priv_alloc_data_size); + + priv_alloc_data = static_cast(priv_drv_data) + priv_drv_data_size; + auto alloc_info = reinterpret_cast( + static_cast(priv_alloc_data) + priv_alloc_data_size * num_allocations); + + size_t size = desc_.size; + uint64_t addr = desc_.gpu_addr; + char *cpu_addr = static_cast(desc_.cpu_addr); + const auto &device_info = GetDevice()->DeviceInfo(); + + for (size_t i = 0; i < num_allocations; i++) { + + void* priv_data = (void*)((char*)priv_alloc_data + priv_alloc_data_size * i); + size_t block_size = std::min(size, 
WDDMDevice::GpuMemoryChunkSize); + + if (IsUserMemory() || IsSystem()) { + thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info); + alloc_info[i].pSystemMem = static_cast(cpu_addr); + cpu_addr += block_size; + } else { + thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info); + } + + size -= block_size; + addr += block_size; + + alloc_info[i].pPrivateDriverData = priv_data; + alloc_info[i].PrivateDriverDataSize = priv_alloc_data_size; + alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED; + } + + D3DKMT_CREATEALLOCATION args = {}; + args.hDevice = device_->DeviceHandle(); + args.pPrivateDriverData = priv_drv_data; + args.PrivateDriverDataSize = priv_drv_data_size; + args.NumAllocations = num_allocations; + args.pAllocationInfo2 = alloc_info; + + /* The PhysicallyContiguous flag causes allocation failure + * args.Flags.PhysicallyContiguous = IsPhysicalContiguous(); + */ + + SharedHandleInfo shared_info; + if (IsShared()) { + shared_info.size = desc_.size; + shared_info.client_size = desc_.client_size; + shared_info.domain = desc_.domain; + shared_info.adapter_luid = desc_.adapter_luid; + shared_info.flags = reinterpret_cast(desc_.flags.reserved); + shared_info.mem_flags = desc_.mem_flags; + shared_info.pid = dxg_runtime->parent_pid; + shared_info.gpu_addr = desc_.gpu_addr; + args.pPrivateRuntimeData = &shared_info; + args.PrivateRuntimeDataSize = sizeof(shared_info); + args.Flags.NtSecuritySharing = 1; + args.Flags.CreateShared = 1; + args.Flags.CreateResource = 1; + } + + auto status = d3dthunk::CreateAllocation(&args); + if (status == ErrorCode::Success) { + for (size_t i = 0; i < num_allocations; i++) + alloc_handles_ptr_[i] = alloc_info[i].hAllocation; + + resource_ = args.hResource; + } + free(priv_drv_data); + return status; +} + +ErrorCode GpuMemory::FreePhysicalMemory() { + auto code = ErrorCode::Success; + + if 
(alloc_handles_ptr_ == nullptr || (NumChunks() == 1 && *alloc_handles_ptr_ == 0)) + return code; + + code = d3dthunk::DestroyAllocation(device_->DeviceHandle(), + resource_, + NumChunks(), + alloc_handles_ptr_); + if (NumChunks() > 1) + delete[] alloc_handles_ptr_; + + alloc_handles_ptr_ = nullptr; + return code; +} + +ErrorCode GpuMemory::MakeResident() { + + D3DDDI_MAKERESIDENT args = {}; + args.hPagingQueue = device_->PagingQueue(); + args.NumAllocations = NumChunks(); + args.AllocationList = alloc_handles_ptr_; + args.Flags.CantTrimFurther = 1; + + auto code = d3dthunk::MakeResident(&args); + if (code == ErrorCode::NotReady) { + const auto fence_value = args.PagingFenceValue; + device_->UpdatePageFence(fence_value); + code = ErrorCode::Success; + } + return code; +} + +ErrorCode GpuMemory::Evict() { + + D3DKMT_EVICT args = {}; + args.hDevice = device_->DeviceHandle(); + args.NumAllocations = NumChunks(); + args.AllocationList = alloc_handles_ptr_; + + return d3dthunk::Evict(&args); +} + +ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) { + if (mem_fd_ > -1) { + *dmabuf_fd = mem_fd_; + return ErrorCode::Success; + } + + if (IsShared()) + return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd); + else + return ErrorCode::UnSupported; +} + + +ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) { + D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args; + int dmabuf_fd = create_info.dmabuf_fd; + + if (dmabuf_fd <= 0) + return ErrorCode::InvalidateParams; + + if(create_info.flags.sysmem_ipc_sig_importer) { + // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference + mem_fd_ = dup(dmabuf_fd); + desc_.client_size = create_info.size; + desc_.size = AdjustSize(desc_.client_size); + desc_.domain = thunk_proxy::AllocDomain::kSystem; + desc_.adapter_luid = device_->GetLuid(); + desc_.alignment = 0x1000; + desc_.mem_flags = create_info.mem_flags; + 
desc_.engine_flag = create_info.engine_flag; + desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer; + desc_.flags.is_va_required = create_info.flags.alloc_va; + desc_.flags.is_virtual = create_info.flags.virtual_alloc; + desc_.flags.is_physical_only = create_info.flags.physical_only; + desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous; + desc_.flags.is_locked = create_info.flags.locked; + + auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment); + if (code != ErrorCode::Success) + return code; + + bool physical_created = false; + auto guard = MakeScopeGuard([this, &physical_created, &code]() { + if (code != ErrorCode::Success) { + if (physical_created) + FreePhysicalMemory(); + FreeGpuVirtualAddress(GpuAddress(), Size()); + } + }); + (void)guard; + + num_allocations_ = CalcChunkNumbers(Size()); + if (num_allocations_ == 1) + alloc_handles_ptr_ = &alloc_handle_; + else + alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_]; + + memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle)); + + code = CreatePhysicalMemory(); + if (code != ErrorCode::Success) + return code; + + physical_created = true; + + code = MapGpuVirtualAddress(GpuAddress(), Size()); + if (code != ErrorCode::Success) + return code; + + code = MakeResident(); + if (code != ErrorCode::Success) + return code; + + if (!GetDevice()->WaitOnPagingFenceFromCpu()) + code = ErrorCode::Unknown; + + return code; + } else { + // vmem importer / ipc vram importer + memset(&query_args, 0, sizeof(query_args)); + query_args.hDevice = device_->DeviceHandle(); + query_args.hNtHandle = reinterpret_cast(dmabuf_fd); + auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args); + if (ret != ErrorCode::Success) { + pr_err("query resource info from nt handle failed %d\n", static_cast(ret)); + return ErrorCode::InvalidateParams; + } + pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d," 
+ " runtime data size %#x total driver data size %#x resource data size=%#x\n", + dmabuf_fd, + query_args.NumAllocations, + query_args.PrivateRuntimeDataSize, + query_args.TotalPrivateDriverDataSize, + query_args.ResourcePrivateDriverDataSize); + + SharedHandleInfo shared_info; + if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) { + pr_err("shared hanle info size mismatch:%d vs %ld\n", + query_args.PrivateRuntimeDataSize, sizeof(shared_info)); + return ErrorCode::UnSupported; + } + + uint32_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) + + query_args.TotalPrivateDriverDataSize + + query_args.ResourcePrivateDriverDataSize; + D3DDDI_OPENALLOCATIONINFO2 *open_info = + reinterpret_cast (calloc(1, total_size)); + if (!open_info) { + pr_err("alloc open_info failed, NumAllocations:%d\n", + query_args.NumAllocations); + return ErrorCode::OutOfMemory; + } + + auto guard = MakeScopeGuard([&open_info]() { free(open_info); }); + + alloc_handles_ptr_ = new WinAllocationHandle[query_args.NumAllocations]; + + D3DKMT_OPENRESOURCEFROMNTHANDLE open_args; + memset(&open_args, 0, sizeof(open_args)); + open_args.hDevice = query_args.hDevice; + open_args.hNtHandle = query_args.hNtHandle; + open_args.NumAllocations = query_args.NumAllocations; + open_args.pOpenAllocationInfo2 = open_info; + open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize; + open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast + (open_args.pOpenAllocationInfo2 + open_args.NumAllocations); + open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize; + open_args.pResourcePrivateDriverData = reinterpret_cast + (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) + + open_args.TotalPrivateDriverDataBufferSize); + open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize; + open_args.pPrivateRuntimeData = reinterpret_cast (&shared_info); + + ret = d3dthunk::OpenResourceFromNtHandle(&open_args); + if (ret != 
ErrorCode::Success) { + ret = ErrorCode::InvalidateParams; + pr_err("open resource failed %d\n", static_cast(ret)); + return ret; + } + if (shared_info.pid == dxg_runtime->parent_pid && + create_info.flags.alloc_va && + IsSameAdapter(shared_info.adapter_luid) && + shared_info.gpu_addr) { + pr_info("import from same device and samve process, va is required. " + "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n"); + if (gpu_addr) + *gpu_addr = shared_info.gpu_addr; + return ErrorCode::SameProcessSameDevice; + } + + desc_.size = shared_info.size; + desc_.client_size = shared_info.client_size; + desc_.domain = shared_info.domain; + desc_.flags.reserved = shared_info.flags; + desc_.mem_flags = shared_info.mem_flags; + desc_.adapter_luid = shared_info.adapter_luid; + resource_ = open_args.hResource; + num_allocations_ = open_args.NumAllocations; + for (int i = 0; i < num_allocations_; i++) + alloc_handles_ptr_[i] = open_info[i].hAllocation; + + desc_.flags.is_va_required = create_info.flags.alloc_va; + if (desc_.flags.is_va_required) { + desc_.flags.is_imported_vram_ipc = 1; + ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment); + if (ret != ErrorCode::Success) + pr_err("failed to allocate svm range, error:%d\n", static_cast(ret)); + + return ret; + } else { + desc_.flags.is_imported_vram_vmem = 1; + return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr); + } + } +} + +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp new file mode 100644 index 0000000000..44658819cb --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/queue.cpp @@ -0,0 +1,1210 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2020, Advanced Micro Devices, Inc. 
All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include + +#include "impl/wddm/queue.h" +#include "impl/registers.h" + +#include "impl/hsa/hsa.h" +#include "impl/hsa/hsa_ven_amd_loader.h" +extern hsa_signal_value_t hsakmt_hsa_signal_load_relaxed(hsa_signal_t signal); +extern hsa_signal_value_t hsakmt_hsa_signal_wait_relaxed( + hsa_signal_t signal, hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); +extern void hsakmt_hsa_signal_store_screlease(hsa_signal_t hsa_signal, + hsa_signal_value_t value); +extern hsa_status_t hsakmt_hsa_ven_amd_loader_query_host_address( + const void *device_address, const void **host_address); + +namespace wsl { +namespace thunk { + +hsa_status_t WDDMQueue::SwsInit(void) { + if (!device->CreateSyncobj(&syncobj, &sync_addr)) + return HSA_STATUS_ERROR; + + if (device->AllocUserQueueMemFromUMD()) { + + GpuMemory *gpu_mem = nullptr; + GpuMemoryCreateInfo create_info{}; + + create_info.domain = thunk_proxy::kUserQueue; + create_info.size = device->GetSwsQueueSize(); + create_info.engine_flag = thunk_proxy::QueueEngine2EngineFlag(queue_engine); + + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) { + device->DestroySyncobj(syncobj); + return HSA_STATUS_ERROR; + } + + queue_mem = gpu_mem->GetGpuMemoryHandle(); + queue = gpu_mem->GetAllocationHandle(0); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SwsFini(void) { + device->DestroySyncobj(syncobj); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value) { + if (!device->SubmitToSwQueue(this, command_addr, command_size, fence_value)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsInit(void) { + if (!device->CreateHwQueue(this)) + return HSA_STATUS_ERROR; + + return 
HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsFini(void) { + if (!device->DestroyHwQueue(this)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::HwsSubmit(uint64_t command_addr, + uint64_t command_size, + uint64_t fence_value) { + if (!device->SubmitToHwQueue(this, command_addr, command_size, fence_value)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t WDDMQueue::SetPriority(hsa_amd_queue_priority_t priority) { + if (!use_hws) + return HSA_STATUS_SUCCESS; + + thunk_proxy::SchedLevel new_prio = ConvertSchedLevel(priority); + if (prio == new_prio) + return HSA_STATUS_SUCCESS; + + pr_debug("set prio %d -> %d\n", prio, new_prio); + device->DestroyHwQueue(this); + + prio = new_prio; + return HwsInit(); +} + +void ComputeQueue::HandleError(hsa_status_t status) { + hsa_signal_t sig = amd_queue_rocr_->queue_inactive_signal; + hsa_signal_value_t val = -1; + + struct queue_error_t { + uint32_t code; + hsa_status_t status; + }; + static const queue_error_t QueueErrors[] = { + {2, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS}, + {4, HSA_STATUS_ERROR_INVALID_ALLOCATION}, + {8, HSA_STATUS_ERROR_INVALID_CODE_OBJECT}, + //{16, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + {32, HSA_STATUS_ERROR_INVALID_PACKET_FORMAT}, + {64, HSA_STATUS_ERROR_INVALID_ARGUMENT}, + //{128, HSA_STATUS_ERROR_OUT_OF_REGISTERS}, + //{0x20000000, HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION}, + //{0x40000000, HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION}, + {0x80000000, HSA_STATUS_ERROR_EXCEPTION}, + }; + for (std::size_t i = 0; i < sizeof(QueueErrors) / sizeof(QueueErrors[0]); ++i) { + if (QueueErrors[i].status == status) { + val = QueueErrors[i].code; + pr_err("error %d, sig_val %ld\n", status, val); + break; + } + } + + if (sig.handle) { + hsakmt_hsa_signal_store_screlease(sig, val); + } + if (error_code_) { + error_code_->store(val, std::memory_order_release); + } +} + +void ComputeQueue::AqlToPm4Thread(ComputeQueue *queue) { + + // This timing 
system is used for sleeping this Thread + // when one packet is invalid for about 2 seconds. + std::chrono::steady_clock::time_point start_time, time; + // Set the polling timeout value for 2 seconds + const std::chrono::milliseconds kMaxElapsed(2000); + uint64_t current_position = queue->GetAqlWriteIndex(); + bool sleep = false; + start_time = std::chrono::steady_clock::now(); + + while (true) { + if (!queue->IsInvalidPacket()) { + hsa_status_t status = queue->Process(); + if (status != HSA_STATUS_SUCCESS) { + pr_err("process compute queue fail status = %08x\n", status); + queue->HandleError(status); + break; + } + sleep = false; + } else { + if (current_position == queue->GetAqlWriteIndex()) { + time = std::chrono::steady_clock::now(); + if (time - start_time > kMaxElapsed) + sleep = true; + } else { + start_time = std::chrono::steady_clock::now(); + current_position = queue->GetAqlWriteIndex(); + sleep = false; + } + } + + if ((queue->GetRingWptr()->load() > queue->GetRingRptr()->load()) && !sleep) + continue; + + std::unique_lock lock(queue->thread_cond_lock_); + // CPU wait for valid packet + if (queue->GetRingWptr()->load() <= queue->GetRingRptr()->load() || + (sleep && queue->IsInvalidPacket())) { + if (queue->thread_stop_) + break; + pr_debug("wait %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + queue->ring, queue->GetRingWptr()->load(), queue->GetRingRptr()->load()); + queue->thread_cond_.wait(lock); + } + } + + pr_debug("aql to pm4 thread %p exit\n", queue->ring); +} + +ComputeQueue::ComputeQueue(WDDMDevice *device, + void *ring, + uint64_t ring_size, + std::atomic *ring_wptr, + std::atomic *ring_rptr, + volatile int64_t *error_addr, + uint32_t cmdbuf_size, + uint32_t engine, + bool use_hws) : + WDDMQueue(device, 0, cmdbuf_size, engine, use_hws), + ring(ring), + ring_size(ring_size), + ring_wptr(ring_wptr), + ring_rptr(ring_rptr), + error_code_(reinterpret_cast*>(error_addr)), + ib_start_addr(0), + ib_size(0), + sync_point(0), + 
cmdbuf_aql_frame_write_index(0), + cmdbuf_aql_frame_size(0), + needs_barrier(true), + ready_to_submit(false), + platform_atomic_support_(false), + signal_addr_(NULL), + thread_stop_(false), + max_scratch_waves_(device->MaxScratchSlotsPerCu() * device->ComputeUnitCount()), + dispatch_waves_(0), + scratch_size_per_wave_(0), + scratch_size_(0), + total_scratch_size_(0), + scratch_base_(nullptr) { + bool ret = device->CreateQueue(this); + assert(ret); + + GpuMemoryCreateInfo create_info{}; + create_info.size = dxg_runtime->page_size; + create_info.domain = thunk_proxy::kSystem; + GpuMemory *gpu_mem = nullptr; + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + assert(code == ErrorCode::Success); + amd_queue_mem_ = gpu_mem->GetGpuMemoryHandle(); + amd_queue_ = reinterpret_cast(gpu_mem->GpuAddress()); + + amd_queue_rocr_ = (amd_queue_v2_t*)((char*)ring_rptr - offsetof(amd_queue_v2_t, read_dispatch_id)); + aql_to_pm4_thread_ = std::thread(AqlToPm4Thread, this); + + if (device->Major() >= 11) + scratch_mem_alignment_size_ = 256; + else + scratch_mem_alignment_size_ = 1024; +} + +ComputeQueue::~ComputeQueue() { + thread_cond_lock_.lock(); + thread_stop_ = true; + thread_cond_lock_.unlock(); + thread_cond_.notify_one(); + aql_to_pm4_thread_.join(); + + //doorbell_signal_->Release(); + + device->DestroyQueue(this); + + if (scratch_base_) { + auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_); + delete scratch_gpu_mem; + } + + auto amd_queue_gpu_mem = GpuMemory::Convert(amd_queue_mem_); + delete amd_queue_gpu_mem; +} + +void ComputeQueue::InitScratchSRD() { + // Populate scratch resource descriptor + SQ_BUF_RSRC_WORD0 srd0; + + uintptr_t scratch_base = uintptr_t(scratch_base_); + srd0.bits.BASE_ADDRESS = scratch_base; + + uint32_t srd1_u32; + + if (device->Major() < 11) { + SQ_BUF_RSRC_WORD1 srd1; + + srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32; + srd1.bits.STRIDE = 0; + srd1.bits.CACHE_SWIZZLE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd1_u32 = 
srd1.u32All; + } else { + SQ_BUF_RSRC_WORD1_GFX11 srd1; + + srd1.bits.BASE_ADDRESS_HI = scratch_base >> 32; + srd1.bits.STRIDE = 0; + srd1.bits.SWIZZLE_ENABLE = 1; + + srd1_u32 = srd1.u32All; + } + + SQ_BUF_RSRC_WORD2 srd2; + + srd2.bits.NUM_RECORDS = scratch_size_; + + uint32_t srd3_u32; + + if (device->Major() < 10) { + SQ_BUF_RSRC_WORD3 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.NUM_FORMAT = BUF_NUM_FORMAT_UINT; + srd3.bits.DATA_FORMAT = BUF_DATA_FORMAT_32; + srd3.bits.ELEMENT_SIZE = 1; // 4 + srd3.bits.INDEX_STRIDE = 3; // 64 + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.ATC__CI__VI = 0; + srd3.bits.HASH_ENABLE = 0; + srd3.bits.HEAP = 0; + srd3.bits.MTYPE__CI__VI = 0; + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else if (device->Major() == 10) { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESOURCE_LEVEL = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else if (device->Major() == 11) { + SQ_BUF_RSRC_WORD3_GFX11 srd3; + + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.RESERVED2 = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } else { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + srd3.bits.DST_SEL_X = SQ_SEL_X; + srd3.bits.DST_SEL_Y = 
SQ_SEL_Y; + srd3.bits.DST_SEL_Z = SQ_SEL_Z; + srd3.bits.DST_SEL_W = SQ_SEL_W; + srd3.bits.FORMAT = BUF_FORMAT_32_UINT; + srd3.bits.RESERVED1 = 0; + srd3.bits.INDEX_STRIDE = 0; // filled in by CP + srd3.bits.ADD_TID_ENABLE = 1; + srd3.bits.WRITE_COMPRESS_ENABLE = 0; + srd3.bits.COMPRESSION_EN = 0; + srd3.bits.COMPRESSION_ACCESS_MODE = 0; + srd3.bits.OOB_SELECT = 2; // no bounds check in swizzle mode + srd3.bits.TYPE = SQ_RSRC_BUF; + + srd3_u32 = srd3.u32All; + } + + // Update Queue's Scratch descriptor's property + amd_queue_->scratch_resource_descriptor[0] = srd0.u32All; + amd_queue_->scratch_resource_descriptor[1] = srd1_u32; + amd_queue_->scratch_resource_descriptor[2] = srd2.u32All; + amd_queue_->scratch_resource_descriptor[3] = srd3_u32; + + // Populate flat scratch parameters in amd_queue_. + amd_queue_->scratch_backing_memory_location = scratch_base; + + // For backwards compatibility this field records the per-lane scratch + // for a 64 lane wavefront. If scratch was allocated for 32 lane waves + // then the effective size for a 64 lane wave is halved. 
+ amd_queue_->scratch_wave64_lane_byte_size = scratch_size_per_wave_ / 64; + + uint64_t num_waves; + if (device->Major() < 11) { + COMPUTE_TMPRING_SIZE tmpring_size; + // Scratch Size per Wave is specified in terms of scratch_mem_alignment_size_ + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + num_waves = scratch_size_ / scratch_size_per_wave_; + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } else if (device->Major() == 11) { + COMPUTE_TMPRING_SIZE_GFX11 tmpring_size; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + // For GFX11 we specify number of waves per engine instead of total + num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine(); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } else { + COMPUTE_TMPRING_SIZE_GFX12 tmpring_size = {}; + tmpring_size.bits.WAVESIZE = scratch_size_per_wave_ / scratch_mem_alignment_size_; + // For GFX12 we specify number of waves per engine instead of total + num_waves = scratch_size_ / scratch_size_per_wave_ / device->NumShaderEngine(); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves_); + + amd_queue_->compute_tmpring_size = tmpring_size.u32All; + } + + return; +} + +uint64_t ComputeQueue::CalcDispatchGroups(hsa_kernel_dispatch_packet_t *packet) +{ + const uint64_t lanes_per_group = + (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z; + + uint64_t groups = ((uint64_t(packet->grid_size_x) + packet->workgroup_size_x - 1) / + packet->workgroup_size_x) * + ((uint64_t(packet->grid_size_y) + packet->workgroup_size_y - 1) / + packet->workgroup_size_y) * + ((uint64_t(packet->grid_size_z) + packet->workgroup_size_z - 1) / + packet->workgroup_size_z); + const uint32_t cu_count = device->ComputeUnitCount(); + const 
uint32_t engines = device->NumShaderEngine(); + + const uint32_t symmetric_cus = AlignDown(cu_count, engines); + const uint32_t asymmetryPerRound = cu_count - symmetric_cus; + const uint64_t rounds = groups / cu_count; + const uint64_t asymmetricGroups = rounds * asymmetryPerRound; + const uint64_t symmetricGroups = groups - asymmetricGroups; + + uint64_t maxGroupsPerEngine = + ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0); + + // For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each + // engine. + if (device->Major() >= 10 && + maxGroupsPerEngine < 16 && + lanes_per_group * maxGroupsPerEngine < 256) { + uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group; + maxGroupsPerEngine = std::min(groups_per_interleave, uint64_t(16ul)); + } + + return maxGroupsPerEngine * engines; +} + +uint64_t ComputeQueue::CalcDispatchWavesPerGroup(hsa_kernel_dispatch_packet_t *packet, + bool wave32) +{ + const uint32_t lanes_per_wave = wave32 ? 32 : 64; + + const uint64_t lanes_per_group = + (uint64_t(packet->workgroup_size_x) * packet->workgroup_size_y) * packet->workgroup_size_z; + + return (lanes_per_group + lanes_per_wave - 1) / lanes_per_wave; +} + +bool ComputeQueue::UpdateScratch(hsa_kernel_dispatch_packet_t *packet, bool wave32) { + const uint32_t lanes_per_wave = wave32 ? 32 : 64; + const uint64_t size_per_thread = AlignUp(packet->private_segment_size, + scratch_mem_alignment_size_ / lanes_per_wave); + + uint64_t groups = CalcDispatchGroups(packet); + uint64_t waves_per_group = CalcDispatchWavesPerGroup(packet, wave32); + + // For packet batching, the maximum value must be used to fit all packets. 
+ scratch_size_per_wave_ = std::max(size_per_thread * lanes_per_wave, scratch_size_per_wave_); + dispatch_waves_ = std::max(groups * waves_per_group, dispatch_waves_); + + const uint64_t max_scratch_size = scratch_size_per_wave_ * max_scratch_waves_; + const uint64_t dispatch_size = scratch_size_per_wave_ * dispatch_waves_; + + scratch_size_ = std::min(dispatch_size, max_scratch_size); + + if (total_scratch_size_ >= scratch_size_) + return true; + + pr_debug("need realloc scratch buffer, size %x -> %x\n", + total_scratch_size_, scratch_size_); + + GpuMemoryCreateInfo create_info{}; + create_info.size = scratch_size_; + create_info.domain = thunk_proxy::kLocal; + GpuMemory *gpu_mem = nullptr; + auto code = device->CreateGpuMemory(create_info, &gpu_mem); + if (code != ErrorCode::Success) + return false; + + if (scratch_base_) { + auto scratch_gpu_mem = GpuMemory::Convert(scratch_mem_); + delete scratch_gpu_mem; + } + + total_scratch_size_ = scratch_size_; + scratch_base_ = reinterpret_cast(gpu_mem->GpuAddress()); + scratch_mem_ = gpu_mem->GetGpuMemoryHandle(); + + InitScratchSRD(); + return true; +} + +bool ComputeQueue::RelocateCmdbufScratchBase(uint64_t addr) { + if (scratch_base_offset_array_.empty()) + return true; + + for (size_t i = 0; i < scratch_base_offset_array_.size(); i++) { + uint32_t *p_compute_user_data = + reinterpret_cast(addr + scratch_base_offset_array_[i]); + if (device->Major() >= 11) { + p_compute_user_data[0] = Ptr48Low32(scratch_base_); + p_compute_user_data[1] = Ptr48High8(scratch_base_); + } else { + p_compute_user_data[0] = PtrLow32(scratch_base_); + p_compute_user_data[1] = (p_compute_user_data[1] & 0xffff0000) | PtrHigh32(scratch_base_); + } + } + scratch_base_offset_array_.clear(); + + return true; +} + +uint32_t ComputeQueue::UpdateIndexStride(uint32_t srd, bool wave32) { + + assert(device->Major() < 13); + + if (device->Major() == 10) { + SQ_BUF_RSRC_WORD3_GFX10 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 
2 : 3; + + return srd3.u32All; + } else if (device->Major() == 11) { + SQ_BUF_RSRC_WORD3_GFX11 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3; + + return srd3.u32All; + } else if (device->Major() == 12) { + SQ_BUF_RSRC_WORD3_GFX12 srd3; + + srd3.u32All = srd; + srd3.bits.INDEX_STRIDE = wave32 ? 2 : 3; + + return srd3.u32All; + } + + return srd; +} + +uint64_t ComputeQueue::GetKernelObjAddr(uint64_t addr) const { + /* convert dev_addr to host_addr */ + auto code = get_gpu_mem((void*)addr); + if (code && code->IsBlitKernelObject()) { + return code->GpuAddress(); + } + + uint64_t host_addr = 0; + auto ret = hsakmt_hsa_ven_amd_loader_query_host_address(reinterpret_cast(addr), + reinterpret_cast(&host_addr)); + if (ret == HSA_STATUS_SUCCESS) { + return host_addr; + } + pr_err("failed to query host address for kernel object %p, ret=%d\n", (void*)addr, ret); + return 0; +} + +void ComputeQueue::RingDoorbell() { + thread_cond_lock_.lock(); + thread_cond_lock_.unlock(); + pr_debug("notify %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, GetRingWptr()->load(), GetRingRptr()->load()); + thread_cond_.notify_one(); +} + +hsa_status_t ComputeQueue::Init(void) { + hsa_status_t ret = use_hws ? HwsInit() : SwsInit(); + if (ret) + return ret; + + ib_start_addr = cmdbuf_addr; + cmdbuf_aql_frame_size = device->GetAqlFrameSize(); + platform_atomic_support_ = device->SupportPlatformAtomic(); + + return ret; +} + +hsa_status_t ComputeQueue::Fini(void) { + return use_hws ? 
HwsFini() : SwsFini(); +} + +hsa_status_t ComputeQueue::PreSubmit(void) { + if (!device->WaitPagingFence(this)) + return HSA_STATUS_ERROR; + + RelocateCmdbufScratchBase(ib_start_addr); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::EndSubmit(void) { + // record last submitted cmdbuf_aql_frame_write_index to see if GPU is hungry + sync_point = cmdbuf_aql_frame_write_index; + + ib_start_addr = cmdbuf_addr + + (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) * + cmdbuf_aql_frame_size; + ib_size = 0; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::Submit(void) { + hsa_status_t ret = PreSubmit(); + if (ret) + return HSA_STATUS_ERROR; + + ret = use_hws ? + HwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index) : + SwsSubmit(ib_start_addr, ib_size, cmdbuf_aql_frame_write_index); + if (ret) + return HSA_STATUS_ERROR; + + ret = EndSubmit(); + if (ret) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +ComputeQueue::KernelDispatchAqlToPm4(char *cpu, hsa_kernel_dispatch_packet_t *packet) { + pr_debug("queue %p kernel dispatch head=%x setup=%x wx=%x wy=%x wz=%x " + "gx=%x gy=%x gz=%x ps=%x gs=%x ko=%" PRIx64 " ka=%p cs=%" PRIx64 "\n", + ring, packet->header, + packet->setup, packet->workgroup_size_x, packet->workgroup_size_y, + packet->workgroup_size_z, packet->grid_size_x, packet->grid_size_y, + packet->grid_size_z, packet->private_segment_size, + packet->group_segment_size, packet->kernel_object, packet->kernarg_address, + packet->completion_signal.handle); + + if (packet->workgroup_size_x > 1024 || + packet->workgroup_size_y > 1024 || + packet->workgroup_size_z > 1024) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + int major = device->Major(); + int i = ib_size; + + const amd_kernel_code_t* kernel_object = + (const amd_kernel_code_t *)GetKernelObjAddr(packet->kernel_object); + if (kernel_object == NULL) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + void* entry = 
(void*)(packet->kernel_object + kernel_object->kernel_code_entry_byte_offset); + assert((size_t)entry % AMD_ISA_ALIGN_BYTES == 0); + + pr_debug("kernel object property=%x entry=%p lds=%x+%x\n", + kernel_object->kernel_code_properties, entry, + kernel_object->workgroup_group_segment_byte_size, + packet->group_segment_size); + + if (packet->setup == 0 || packet->setup > 3) + return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + if (packet->group_segment_size > device->LdsSize()) + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + + uint32_t lds_blks = device->LdsBlocks(packet); + if (lds_blks > 128) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + const bool wave32 = + AMD_HSA_BITS_GET(kernel_object->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32); + + assert(packet->private_segment_size >= kernel_object->workitem_private_segment_byte_size); + + if (packet->private_segment_size != 0) + UpdateScratch(packet, wave32); + + amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle; + + // Record start timestamp when enabling profiling + if (signal && EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i); + + // Build a barrier packet if it is requested + const bool is_barrier_packet = (packet->header >> HSA_PACKET_HEADER_BARRIER) & 0x1; + if (is_barrier_packet && needs_barrier) + i += cmd_util.BuildBarrier(cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + if (major >= 11) { + AppendCmdbufSratchBaseOffset( + i + offsetof(struct SetScratchTemplate, scratch_lo)); + + i += cmd_util.BuildScratch(ScratchBase(), cpu + i); + i += cmd_util.BuildComputeShaderParams(cpu + i); + } + + struct DispatchInfo info; + info.major = major; + info.pPacket = packet; + info.pEntry = entry; + info.pKernelObject = kernel_object; + info.ldsBlks = lds_blks; + info.pAmdQueue = amd_queue_; + info.wave32 = wave32; + info.srd = UpdateIndexStride( + info.pAmdQueue->scratch_resource_descriptor[3], wave32); + 
info.pScratchBase = ScratchBase(); + info.scratchSizePerWave = ScratchSizePerWave(); + memset(info.scratchBaseOffset, 0, sizeof(info.scratchBaseOffset)); + info.offsetCnt = 0; + + size_t size; + size = cmd_util.BuildDispatch(&info, cpu + i); + for (int j = 0; j < info.offsetCnt; j++) + AppendCmdbufSratchBaseOffset(i + info.scratchBaseOffset[j]); + i += size; + + needs_barrier = (packet->completion_signal.handle == 0); + + if (signal) { + // wait cs done + i += cmd_util.BuildBarrier(cpu + i); + + // Record end timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + assert(signal->kind == AMD_SIGNAL_KIND_USER); + uint64_t *signal_addr = (uint64_t *)&signal->value; + pr_debug("signal value=%" PRIx64 "\n", signal->value); + + if (platform_atomic_support_) + i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1); + else + signal_addr_ = signal_addr; + } + + // The ring_rptr is used to record pm4 queue rptr value, + // dispatch readptr position, this is used to share rptr with + // aql queue. 
+ if (platform_atomic_support_) + i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i); + else + i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1); + + // Check if we exceeded the frame size + if ((i - ib_size) > cmdbuf_aql_frame_size) { + pr_err("PM4 command buffer overflow in KernelDispatch: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + ib_size = i; + cmdbuf_aql_frame_write_index++; + packet->header = HSA_PACKET_TYPE_INVALID; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +ComputeQueue::BarrierGenericAqlToPm4(char *cpu, hsa_barrier_and_packet_t *packet, bool is_or) { + pr_debug("queue %p %s head=%x dep %" PRIx64 " %" PRIx64 " %" PRIx64 + " %" PRIx64 " %" PRIx64 " cs=%" PRIx64"\n", + ring, is_or ? "or" : "and", + packet->header, packet->dep_signal[0].handle, + packet->dep_signal[1].handle, packet->dep_signal[2].handle, + packet->dep_signal[3].handle, packet->dep_signal[4].handle, + packet->completion_signal.handle); + // fix me: can we use gpu packet? 
+ if (is_or) { + bool unsignaled = true; + hsa_signal_t sig[5]; + int n = 0; + for (int i = 0; i < 5; i++) { + if (packet->dep_signal[i].handle) + sig[n++] = packet->dep_signal[i]; + } + + while (n) { + for (int i = 0; i < n; i++) { + if (!hsakmt_hsa_signal_load_relaxed(sig[i])) { + unsignaled = false; + break; + } + } + if (!unsignaled) + break; + + std::this_thread::sleep_for(std::chrono::microseconds(20)); + } + } else { + for (int i = 0; i < 5; i++) { + if (!packet->dep_signal[i].handle) + continue; + + hsa_signal_value_t value = + hsakmt_hsa_signal_wait_relaxed(packet->dep_signal[i], HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + assert(value == 0); + } + } + + int major = device->Major(); + int i = ib_size; + + if (packet->completion_signal.handle != 0) { + amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle; + assert(signal->kind == AMD_SIGNAL_KIND_USER); + uint64_t *signal_addr = (uint64_t *)&signal->value; + pr_debug("signal value=%" PRIx64 "\n", signal->value); + + // Record start timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i); + + if (needs_barrier) + i += cmd_util.BuildBarrier(cpu + i); + + needs_barrier = false; + + // Record end timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + if (platform_atomic_support_) + i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1); + else + signal_addr_ = signal_addr; + } + + // The ring_rptr is used to record pm4 queue rptr value, + // dispatch readptr position, this is used to share rptr with + // aql queue. 
+ if (platform_atomic_support_) + i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i); + else + i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1); + + // Check if we exceeded the frame size + if ((i - ib_size) > cmdbuf_aql_frame_size) { + pr_err("PM4 command buffer overflow in BarrierGeneric: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + ib_size = i; + cmdbuf_aql_frame_write_index++; + packet->header = HSA_PACKET_TYPE_INVALID; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ComputeQueue::VendorSpecificAqlToPm4(char *cpu, amd_aql_pm4_ib *packet) { + constexpr uint32_t AMD_AQL_FORMAT_PM4_IB = 0x1; + assert(packet->ven_hdr == AMD_AQL_FORMAT_PM4_IB); + + uint8_t op = (packet->ib_jump_cmd[0] >> PM4_OPCODE_SHIFT) & 0xff; + assert(op == IT_INDIRECT_BUFFER); + uint32_t* pm4_addr = reinterpret_cast((static_cast(packet->ib_jump_cmd[2]) << 32) | (static_cast(packet->ib_jump_cmd[1]) & ~3ull)); + uint32_t pm4_size = packet->ib_jump_cmd[3]&0xfffff; + pr_debug("queue %p %s VENDOR_SPECIFIC pkt pm4_addr %p pm4_size %#x cs=%" PRIx64"\n", + ring, dxg_runtime->vendor_packet_process ? 
"process" : "skip", pm4_addr, pm4_size, + packet->completion_signal.handle); + for (int i = 0; i < pm4_size; i++) { + pr_debug("pm4_addr[%d]=%#x\n", i, pm4_addr[i]); + } + + int i = ib_size; + + if (dxg_runtime->vendor_packet_process) { + int major = device->Major(); + memcpy(cpu+i, pm4_addr, pm4_size * sizeof(uint32_t)); + i += pm4_size * sizeof(uint32_t); + + if (packet->completion_signal.handle != 0) { + amd_signal_t *signal = (amd_signal_t *)packet->completion_signal.handle; + assert(signal->kind == AMD_SIGNAL_KIND_USER); + uint64_t *signal_addr = (uint64_t *)&signal->value; + pr_debug("signal value=%" PRIx64 "\n", signal->value); + + // Record start timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->start_ts, cpu + i); + + //if (needs_barrier) + i += cmd_util.BuildBarrier(cpu + i); + + //needs_barrier = false; + + // Record end timestamp when enabling profiling + if (EnableProfiling()) + i += cmd_util.BuildCopyData(&signal->end_ts, cpu + i); + + // flush cache + i += cmd_util.BuildAcquireMem(major, cpu + i); + + if (platform_atomic_support_) + i += cmd_util.BuildAtomicMem(signal_addr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i, cache_policy__mec_atomic_mem__bypass, -1); + else + signal_addr_ = signal_addr; + } + } else { + if (packet->completion_signal.handle != 0) { + hsakmt_hsa_signal_store_screlease(packet->completion_signal, 0); + } + } + + // The ring_rptr is used to record pm4 queue rptr value, + // dispatch readptr position, this is used to share rptr with + // aql queue. 
+ if (platform_atomic_support_) + i += cmd_util.BuildAtomicMem((uint64_t *)ring_rptr, TC_OP_ATOMIC_ADD_RTN_64, cpu + i); + else + i += cmd_util.BuildWriteData64Command(cpu + i, (uint64_t *)ring_rptr, cmdbuf_aql_frame_write_index + 1); + + // Check if we exceeded the frame size + if ((i - ib_size) > cmdbuf_aql_frame_size) { + pr_err("PM4 command buffer overflow in VendorSpecific: used %d bytes, limit %d bytes\n", i - ib_size, cmdbuf_aql_frame_size); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + ib_size = i; + cmdbuf_aql_frame_write_index++; + packet->header = HSA_PACKET_TYPE_INVALID; + return HSA_STATUS_SUCCESS; +}
+
+// Translates the AQL packet at ring slot
+// (cmdbuf_aql_frame_write_index % ring_size) into PM4 commands at
+// ib_start_addr, dispatching on the packet type stored in its header.
+//
+// Returns the converter's error if the packet could not be translated, and
+// HSA_STATUS_ERROR_INVALID_PACKET_FORMAT for an unknown packet type. On
+// success ready_to_submit is set to true, except for a kernel dispatch that
+// may still be merged with following packets (see the conditions below).
+hsa_status_t ComputeQueue::SwitchAql2PM4(void) {
+
+  uint16_t *packet = (uint16_t *) ((char *)ring +
+                     (cmdbuf_aql_frame_write_index % ring_size) * 64);
+  uint16_t header = (*packet >> HSA_PACKET_HEADER_TYPE);
+  header &= (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
+  hsa_kernel_dispatch_packet_t *aql_packet =
+      (hsa_kernel_dispatch_packet_t *)packet;
+  hsa_status_t ret;
+
+  switch (header) {
+  case HSA_PACKET_TYPE_KERNEL_DISPATCH:
+    ret = KernelDispatchAqlToPm4((char *)ib_start_addr, aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+
+    // Stop merging packets once any of the conditions below is met:
+    // 1) The kernel has a completion signal;
+    // 2) The cmdbuf_aql_frame_write_index reaches the end of cmdbuf
+    // 3) The queue is empty now, submit the package right now.
+    if (!(aql_packet->completion_signal.handle) &&
+        (cmdbuf_aql_frame_write_index % WDDMDevice::GetAqlFrameNum()) &&
+        (*sync_addr != sync_point))
+      return HSA_STATUS_SUCCESS;
+
+    break;
+  case HSA_PACKET_TYPE_BARRIER_AND:
+    // A failed conversion does not consume the packet, so it must not be
+    // followed by a submit: report the error like the dispatch case does.
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_BARRIER_OR:
+    ret = BarrierGenericAqlToPm4((char *)ib_start_addr, (hsa_barrier_and_packet_t *)aql_packet, true);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_VENDOR_SPECIFIC:
+    ret = VendorSpecificAqlToPm4((char *)ib_start_addr, (amd_aql_pm4_ib *)aql_packet);
+    if (ret != HSA_STATUS_SUCCESS)
+      return ret;
+    break;
+  case HSA_PACKET_TYPE_INVALID:
+    // When packets are submitted out of order, the format field of current AQL packet
+    // may not have been updated yet and is still INVALID. Return HSA_STATUS_SUCCESS and
+    // do not process AQL packets before the packet format field is updated.
+    assert(false && "Should not reach here, HSA_PACKET_TYPE_INVALID has been filtered in upper layer");
+    return HSA_STATUS_SUCCESS;
+  default:
+    return HSA_STATUS_ERROR_INVALID_PACKET_FORMAT;
+  }
+
+  ready_to_submit = true;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t ComputeQueue::Process(void) { + + while (cmdbuf_aql_frame_write_index < ring_wptr->load() && + !IsInvalidPacket()) { + pr_debug("process %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, ring_wptr->load(), ring_rptr->load()); + + hsa_status_t ret; + + // wait for next few cmdbuf slots to be free + // If wptr catch up the rptr in the cmdbuf, this needs wait for the rptr to free the cmdbuf. + // Here the wptr comes from queue->cmdbuf_aql_frame_write_index, while rptr comes from *queue->sync_addr. 
+ if (*sync_addr + WDDMDevice::GetAqlFrameNum() <= cmdbuf_aql_frame_write_index) { + uint64_t value = cmdbuf_aql_frame_write_index - WDDMDevice::GetAqlFrameNum() + 1; + if (!device->CpuWait(&syncobj, &value, 1, false)) + return HSA_STATUS_ERROR; + } + + ret = SwitchAql2PM4(); + if (ret != HSA_STATUS_SUCCESS) + return ret; + + if (!ready_to_submit) + continue; + + ret = Submit(); + if (ret != HSA_STATUS_SUCCESS) + return ret; + + // CPU wait for GPU fence, and cpu update the signal. + if (!platform_atomic_support_ && signal_addr_) { + // CPU wait for GPU fence + if (!device->CpuWait(&syncobj, &cmdbuf_aql_frame_write_index, 1, false)) + return HSA_STATUS_ERROR; + //CPU update completional signal + atomic::Decrement(signal_addr_); + signal_addr_ = NULL; + } + + ready_to_submit = false; + + pr_debug("done %p wptr=%" PRIx64 " rptr=%" PRIx64 "\n", + ring, ring_wptr->load(), ring_rptr->load()); + } + + return HSA_STATUS_SUCCESS; +} + +void SDMAQueue::SdmaThread(SDMAQueue *queue) { + + while (true) { + decltype(queue->wptr_queue_) pendings; + { + std::unique_lock lock(queue->thread_cond_lock_); + while (queue->wptr_queue_.empty() && !queue->thread_stop_) + queue->thread_cond_.wait(lock); + + if (queue->thread_stop_) + break; + + pendings.swap(queue->wptr_queue_); + } + + for (const auto [start, end] : pendings) { + pr_debug("wptr %lx %lx\n", start, end); + + SDMA_PKT_POLL_REGMEM* poll_pkt = reinterpret_cast(queue->cmdbuf_addr + queue->WrapIntoRocrRing(start)); + SDMA_PKT_POLL_REGMEM* poll_next_pkt = poll_pkt + 1; + while (queue->IsPollPacket(poll_pkt)) { + uint64_t poll_addr = poll_pkt->ADDR_LO_UNION.addr_31_0 | + (uint64_t)poll_pkt->ADDR_HI_UNION.addr_63_32 << 32; + + uint64_t poll_val = poll_pkt->VALUE_UNION.value; + uint32_t skip = 1; + + if (queue->IsPollPacket(poll_next_pkt)) { + uint64_t poll_next_addr = poll_next_pkt->ADDR_LO_UNION.addr_31_0 | + (uint64_t)poll_next_pkt->ADDR_HI_UNION.addr_63_32 << 32; + + if (poll_next_addr + sizeof(uint32_t) == poll_addr) { + 
poll_addr = poll_next_addr; + poll_val = poll_next_pkt->VALUE_UNION.value | + (uint64_t)poll_pkt->VALUE_UNION.value << 32; + skip = 2; + } + } + + amd_signal_t* signal = (amd_signal_t*)((char*)poll_addr - offsetof(amd_signal_t, value)); + uint64_t signal_handle = reinterpret_cast(signal); + pr_debug("poll signal %#lx addr %#lx val %ld\n", signal_handle, poll_addr, poll_val); + hsa_signal_t hsa_signal = {signal_handle}; + hsa_signal_value_t value = + hsakmt_hsa_signal_wait_relaxed(hsa_signal, HSA_SIGNAL_CONDITION_EQ, poll_val, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + assert(value == poll_val); + + memset(poll_pkt, 0, skip * sizeof(*poll_pkt)); + poll_pkt += skip; + poll_next_pkt += skip; + } + queue->PreparePacket(queue->WrapIntoRocrRing(start), end - start); + std::atomic_thread_fence(std::memory_order_release); + queue->Submit(); + } + } + pr_debug("sdma thread exit\n"); +} + +SDMAQueue::SDMAQueue(WDDMDevice *device, + void *ring, + uint64_t cmdbuf_size, + uint32_t engine, + bool use_hws) : + WDDMQueue(device, reinterpret_cast(ring), cmdbuf_size, engine, use_hws), + wptr_next_(0), + wptr_pre_(0), + rptr_next(0), + thread_stop_(false), + ib_size(0), + ib_start_addr(0) { + bool ret = device->CreateQueue(this); + assert(ret); + + thread_ = std::thread(SdmaThread, this); +} + +SDMAQueue::~SDMAQueue() { + thread_cond_lock_.lock(); + thread_stop_ = true; + thread_cond_lock_.unlock(); + thread_cond_.notify_one(); + thread_.join(); + + device->DestroyQueue(this); +} + +void SDMAQueue::RingDoorbell() { + pr_debug("ringdoorbell %#lx %#lx\n", wptr_pre_, wptr_next_); + thread_cond_lock_.lock(); + + wptr_queue_.emplace_back(wptr_pre_, wptr_next_); + thread_cond_.notify_one(); + + thread_cond_lock_.unlock(); + wptr_pre_ = wptr_next_; +} + +hsa_status_t SDMAQueue::Init(void) { + hsa_status_t ret = use_hws ? 
HwsInit() : SwsInit(); + if (ret) + return ret; + + std::memset((char *)cmdbuf_addr, 0, cmdbuf_size); + + return ret; +} + +hsa_status_t SDMAQueue::Fini(void) { + return use_hws ? HwsFini() : SwsFini(); +} + +int SDMAQueue::PreparePacket(uint32_t offset, uint64_t size) { + ib_start_addr = cmdbuf_addr + offset; + ib_size = size; + rptr_next += ib_size; + + return STATUS_SUCCESS; +} + +hsa_status_t SDMAQueue::Submit(void) { + if (!device->WaitPagingFence(this)) + return HSA_STATUS_ERROR; + + int ret = use_hws ? + HwsSubmit(ib_start_addr, ib_size, rptr_next) : + SwsSubmit(ib_start_addr, ib_size, rptr_next); + if (ret) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +} // namespace thunk +} // namespace wsl diff --git a/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp new file mode 100644 index 0000000000..4ea93c70f2 --- /dev/null +++ b/projects/rocr-runtime/libhsakmt/src/dxg/wddm/va_mgr.cpp @@ -0,0 +1,165 @@ +#include +#include +#include +#include "impl/wddm/va_mgr.h" + +using namespace std; + +namespace wsl { +namespace thunk { + +VaMgr::VaMgr(uint64_t start, uint64_t size, uint64_t min_align) { + min_align_ = min_align; + auto free_it = free_list_.insert(make_pair(size, start)); + frag_map_[start] = make_fragment(free_it, size); +} + +VaMgr::~VaMgr() { + + if (free_list_.size() != 1) + pr_warn("free_list_ size:%ld which should be 1.\n", free_list_.size()); + if (frag_map_.size() != 1) + pr_warn("frag_map_ size:%ld which should be 1.\n", frag_map_.size()); + + free_list_.clear(); + frag_map_.clear(); +}
+
+// Reserves `bytes` of address space and returns the start of the range.
+//
+// bytes - size of the range to reserve.
+// align - required alignment; 0 means no alignment requirement.
+// addr  - requested fixed start address; 0 means "no preference".
+//
+// When `addr` is non-zero and satisfies `align`, the range
+// [addr, addr + bytes) is carved out of the free fragment containing it and
+// `addr` is returned. If `addr` is zero, misaligned, or the range does not
+// lie entirely inside one free fragment, the request falls back to
+// AllocImpl(), which chooses the address itself.
+uint64_t VaMgr::Alloc(uint64_t bytes, uint64_t align, uint64_t addr) {
+
+  if (addr > 0 &&
+      (align == 0 || (addr % align) == 0)) {
+
+    // Scoped to this block: AllocImpl() below takes the lock itself.
+    lock_guard gard(lock_);
+
+    // frag_map_ is keyed by fragment base, so the only fragment that can
+    // contain `addr` is the one with the greatest base <= addr (assuming
+    // fragments never overlap). This includes the first fragment in the map.
+    auto frag_it = frag_map_.upper_bound(addr);
+    if (frag_it != frag_map_.begin()) {
+      --frag_it;
+
+      const uint64_t base = frag_it->first;
+      const uint64_t size = frag_it->second.size;
+
+      // addr >= base holds by construction. The containment test uses
+      // subtractions only, so it cannot wrap around for large values.
+      if (is_free(frag_it->second) && bytes <= size && addr - base <= size - bytes) {
+        // Try to allocate target `addr` from this free fragment
+        auto free_it = frag_it->second.free_list_entry_;
+        assert(free_it != free_list_.end());
+        free_list_.erase(free_it);
+
+        if (addr == base) {
+          // The used range starts the fragment: shrink the entry in place.
+          frag_it->second.size = bytes;
+          set_used(frag_it->second);
+        } else {
+          // [addr, addr + bytes) becomes a used fragment keyed at `addr`,
+          // and [base, addr) stays free. Same sequence as the unaligned
+          // path of AllocImpl(); keying the used entry at `base` would let
+          // the free fragment below replace it.
+          add_used_fragment(bytes, addr);
+          add_free_fragment(addr - base, base);
+        }
+
+        // [addr + bytes, base + size)
+        if (size - (addr - base) > bytes)
+          add_free_fragment(size - (addr - base) - bytes, addr + bytes);
+
+        return addr;
+      }
+    }
+  }
+
+  // Allocate not fixed address
+  return AllocImpl(bytes, align);
+}
+
+uint64_t VaMgr::AllocImpl(const uint64_t bytes, const uint64_t align) { + uint64_t addr = 0; + uint64_t align_bytes = bytes; + const int retry = align == 0 ? 0 : 1; + const uint64_t new_align = align == 0 ? min_align_ : AlignUp(align, min_align_); + + lock_guard gard(lock_); + for (int i = 0; i <= retry; i++) { + auto free_it = free_list_.lower_bound(align_bytes); + if (free_it == free_list_.end()) break; + + uint64_t base = free_it->second; + uint64_t size = free_it->first; + + assert(size >= align_bytes); + + auto fragment = frag_map_.find(base); + + assert(fragment != frag_map_.end()); + assert(size == fragment->second.size); + + uint64_t delta = align == 0 ? 
0 : base % align; + if (delta == 0) { + // already find aligned address + addr = base; + + free_list_.erase(free_it); + fragment->second.size = bytes; + set_used(fragment->second); + + if (size > bytes) add_free_fragment(size - bytes, base + bytes); + + break; + } else if (i == 0) { + align_bytes += new_align; + continue; + } else { + uint64_t aligned_base = base + align - delta; + addr = aligned_base; + + free_list_.erase(free_it); + + add_used_fragment(bytes, aligned_base); + add_free_fragment(aligned_base - base, base); + + if (size > aligned_base - base + bytes) + add_free_fragment(size - (aligned_base - base) - bytes, aligned_base + bytes); + + break; + } + } + return addr; +} + +void VaMgr::Free(uint64_t addr) { + if (addr == 0) return; + + lock_guard gard(lock_); + auto frag_it = frag_map_.find(addr); + if (frag_it == frag_map_.end() || is_free(frag_it->second)) return; + + uint64_t base = addr; + // Merge lower + if (frag_it != frag_map_.begin()) { + auto lower = frag_it; + --lower; + if (is_free(lower->second)) { + remove_free_list_entry(lower->second); + base -= lower->second.size; + lower->second.size += frag_it->second.size; + frag_map_.erase(frag_it); + frag_it = lower; + } + } + // Merge upper + { + auto upper = frag_it; + ++upper; + if (upper != frag_map_.end() && is_free(upper->second)) { + remove_free_list_entry(upper->second); + frag_it->second.size += upper->second.size; + frag_map_.erase(upper); + } + } + uint64_t size = frag_it->second.size; + auto it = free_list_.insert(make_pair(size, base)); + set_free(frag_it->second, it); +} + +} // namespace thunk +} // namespace wsl